### A Pluto.jl notebook ###
# v0.19.14

using Markdown
using InteractiveUtils

# This Pluto notebook uses @bind for interactivity. When running this notebook outside of Pluto, the following 'mock version' of @bind gives bound variables a default value (instead of an error).
macro bind(def, element)
    quote
        # Look up `Bonds.initial_value` in AbstractPlutoDingetjes if that package is loaded;
        # if the lookup throws, fall back to a function that returns `missing` for any element.
        local iv = try Base.loaded_modules[Base.PkgId(Base.UUID("6e696c72-6542-2067-7265-42206c756150"), "AbstractPlutoDingetjes")].Bonds.initial_value catch; b -> missing; end
        # Evaluate the widget expression in the caller's scope.
        local el = $(esc(element))
        # Define the bound variable as a global: prefer `Base.get(el)` when such a method
        # is applicable, otherwise use the initial value determined by `iv`.
        global $(esc(def)) = Core.applicable(Base.get, el) ? Base.get(el) : iv(el)
        # The value of the macro call is the widget itself.
        el
    end
end

# ╔═╡ 6b9e49ac-f2f7-11ed-3e20-614d6f3d4ca3
begin
	# Use the project returned by `Base.current_project()` as the active environment
	# and install its dependencies before any `using` cell runs.
	import Pkg
	Pkg.activate(Base.current_project())
	Pkg.instantiate()
end

# ╔═╡ 562b755a-8ba6-4baf-bb8c-20d626556459
using PlutoUI

# ╔═╡ 085557db-7b8e-4210-85c2-135d71eb7fbc
using DataFrames, CSV

# ╔═╡ 8e09926c-73d7-42c8-ad7e-c7310ed2133a
using SQLite

# ╔═╡ 425a6927-a0b8-4d64-bdc3-874ccac9654e
using YAML

# ╔═╡ f91e7737-b802-423f-8a17-080f72d847bb
using Statistics

# ╔═╡ 8f1229ea-5bab-41c9-9df3-297b003aaaae
using Printf

# ╔═╡ 3464c7e7-a4fd-4a68-988e-10e5c46f4963
begin
	using AlgebraOfGraphics, CairoMakie

	#set_aog_theme!()
	#set_theme!() # reset

	# Tick style for logarithmic axes showing byte counts:
	# major ticks sit at powers of 1024 and are labelled "1 B", "1 KiB", "1 MiB", ...
	struct BytesTicks end

	# Minor ticks: 10x and 100x every major tick value except the last one.
	# The axis limits `vmin`/`vmax` are not consulted.
	function Makie.get_minor_tickvalues(
		::BytesTicks,
		::Union{typeof(log),typeof(log10),typeof(log2)},
		tickvalues,
		vmin, vmax,
	)
		n = length(tickvalues)
		[a*tickvalues[i] for i in 1:n-1 for a in (10, 100,)]
	end

	# Major ticks: one tick per power of 1024 covering `[vmin, vmax]`.
	# Returns the tuple `(values, labels)`.
	function Makie.get_ticks(
		::BytesTicks,
		::Union{typeof(log),typeof(log10),typeof(log2)},
		::Makie.Automatic,
		vmin, vmax,
	)
		unit = ["B", "KiB", "MiB", "GiB", "TiB"]
		# Largest exponent for which a unit label exists.
		emax_supported = length(unit) - 1

		# Limits below 1 B would yield a negative exponent, for which `1024^e` throws
		# a DomainError and `unit[e+1]` is out of bounds; limits beyond the last unit
		# would also index past `unit`. Clamp both ends to the labelled range.
		emin = clamp(floor(Int, log(1024, max(vmin, 1))), 0, emax_supported)
		emax = clamp(ceil(Int, log(1024, max(vmax, 1))), emin, emax_supported)

		values = [1024^e for e in emin:emax]
		labels = ["1 $(unit[e+1])" for e in emin:emax]

		values, labels
	end
end

# ╔═╡ 1dfc1459-3f2d-4e98-9fc9-ea60e6ae6118
md"""
# DAMN SpMV
"""

# ╔═╡ 3f530c87-08e8-466b-b07b-3ea4e9ce8cea
TableOfContents()

# ╔═╡ ec06f9d6-53b3-455b-b46d-ed03dc29786f
md"""
# Load Benchmark Measurements
"""

# ╔═╡ 174e38d2-519f-4e8c-8465-03af554ba397
md"""
Data directory (`path`):
$(@bind path confirm(TextField(default=pwd())))
"""

# ╔═╡ 53acb72c-876e-4a2c-ad8d-c8d88fbfaf0b
abspath(path)

# ╔═╡ 928d7a42-3e91-4c69-b1ef-6877ade1e95f
"""
    read_spmv_data(path...)

Read every `.csv` file found (recursively) below `joinpath(path..., "spmv")`
and stack them into a single `DataFrame`.
Columns that are missing in some of the files are kept (`cols=:union`).
"""
function read_spmv_data(path...)
	measurements = DataFrame()
	for (dir, _, filenames) in walkdir(joinpath(path..., "spmv"))
		for name in filenames
			if endswith(name, ".csv")
				# The benchmark output separates fields by a comma followed by a space.
				chunk = CSV.read(joinpath(dir, name), DataFrame; delim = ", ")
				append!(measurements, chunk, cols=:union)
			end
		end
	end
	return measurements
end

# ╔═╡ 23614ac3-8122-4817-9fa2-44236206bd3c
df_raw = read_spmv_data(path, "no-omp");

# ╔═╡ c3fb960c-5f3e-4d41-b981-df3eb2951883
md"""
## Matrix Metadata
"""

# ╔═╡ 27f0deec-fc59-43e1-8576-9ae9901a6db7
# Julia does not expand a leading `~` in paths by itself, so expand it explicitly;
# otherwise the default would point to a directory literally named `~` below the
# current working directory.
UF_COLLECTION_CACHE_DIR = expanduser(get(ENV, "UF_COLLECTION_CACHE_DIR", "~/.ufget"))

# ╔═╡ 40b76c0c-eb34-4452-a4ed-5d7b4e2882de
db_path = joinpath(UF_COLLECTION_CACHE_DIR, "ufmatrices.db")

# ╔═╡ 6cf42151-ad9b-4de8-976c-ac25659e595e
db = SQLite.DB(db_path)

# ╔═╡ ca2bf257-8ddd-4973-9161-291d0b7f7ad0
# Per-matrix metadata read from the `matrices` table of the SQLite database `db`.
# The resulting columns are:
# * `A_name`: "<group_name>/<name>", used as the join key with the measurements
# * `A_nrows`: number of rows
# * `A_nnz`: sum of the `nnz` and `nzeros` columns
# * `A_nnz/nrows`: `A_nnz` divided by `A_nrows` (as a floating-point number)
context = DataFrame(DBInterface.execute(db, """
	SELECT
		group_name || '/' || name AS A_name,
		nrows AS A_nrows,
		nnz + nzeros AS A_nnz,
		CAST(nnz + nzeros AS float) / nrows AS "A_nnz/nrows"
	FROM matrices
"""))

# ╔═╡ b311ecac-e6cb-4d91-8900-22a3833f44f5
md"""
## Hardware Metadata

### Cache Sizes
Parse the `info.yaml` within the selected data directory.
"""

# ╔═╡ 3a9bd51d-05f9-41a7-967c-e6b2f12747af
info = YAML.load_file(joinpath(path, "no-omp", "info.yaml"))

# ╔═╡ e7d34afe-1a45-4a9a-b500-fd8715eec20d
cache_levels = ["L1d", "L2", "L3", "L4"]

# ╔═╡ 039b68da-41e7-4ba5-bad0-00a6213a801b
# Table of cache levels and their sizes as listed under the "cache" key of `info`.
# Levels that are absent (or listed as `nothing`) get size 0 and are dropped.
cache = let
	sizes = map(cache_levels) do lvl
		something(get(info["cache"], lvl, nothing), 0)
	end
	table = DataFrame(level = cache_levels, size = sizes)
	subset!(table, :size => ByRow(>(0)))
end


# ╔═╡ f70ea745-b234-45a1-91ca-cfff6c9f4b98
md"""
### Memory Bandwidth

Inspect `triad` and `prod` benchmarks to obtain the hardware memory bandwidth depending on the traffic involved.
In comparison to the SpMV product, these benchmarks perform linear memory accesses,
which means that the measured throughputs are upper bounds to the ones of the SpMV product.
"""

# ╔═╡ a19bfbbc-24f9-4b2d-a238-5fcf0332822d
md"""
!!! info
	The way I measure the memory bandwidth doesn't seem to scale when using OpenMP.
	I would expect the relative throughput to be bound by 100%,
	but my actual measurements show a plateau at about 120%.
	In addition, for traffic within the CPU's L2 cache (1 MiB),
	the runtime of the Triad (and Prod) benchmarks is dominated by the overhead of spawning additional worker threads.
	Thus, the corresponding measured throughput stands in no relation to the hardware's capabilities.

	For these reasons, the relative throughput remains reported with respect to the single-threaded measurements.
	On-chip, i.e. with traffic up to the size of the L3 cache (11 MiB LLC),
	the relative throughput should hypothetically be limited by the number of threads.
	Off-chip, i.e. with traffic beyond that,
	the relative throughput should be limited by the number of memory channels (6 in this case).
"""

# ╔═╡ 0d2eecd5-c360-4f2d-b086-d93069a02b86
time_estimator = :min_elapsed_s
#time_estimator = :med_elapsed_s

# ╔═╡ fbfa40d4-0301-4016-8247-e90b7c6029db
delim = ", "

# ╔═╡ e3bf2756-f8c3-4030-96c4-a918f65ffdf0
# Best measured memory throughput per traffic bin (`traffic_h`),
# taken over the single-threaded ("no-omp") triad and prod benchmarks.
df_throughput = let
	df1 = CSV.read(joinpath(path, "no-omp", "triad.csv"), DataFrame; delim)
	df2 = CSV.read(joinpath(path, "no-omp", "prod.csv"), DataFrame; delim)
	# Tag every row with the benchmark it originates from.
	df1.description = @. df1.description * " triad"
	df2.description = @. df2.description * " prod"
	df = vcat(df1, df2)
	# Throughput according to the globally selected `time_estimator` column.
	df."throughput_GiB/s" = df.traffic_GiB ./ df[!, time_estimator]
	# Maximum throughput within each traffic bin ...
	df_best = combine(
		groupby(df, :traffic_h),
		"throughput_GiB/s" => maximum => "throughput_GiB/s",
	)
	# ... and only the rows that attain it (the join matches on the bin and on the
	# throughput value itself, so ties keep more than one row per bin).
	df = innerjoin(df, df_best, on=names(df_best))
	df.traffic_B = df.traffic_GiB * 1024^3
	select!(df, Cols(["description", "traffic_h", "traffic_B", "throughput_GiB/s"]))
	sort!(df, :traffic_B)
	df
end

# ╔═╡ b2fd274f-ae8f-4a1e-a42c-2c9c583ceb41
md"""
## Combine Measurements and Metadata
"""

# ╔═╡ f755bf50-420c-4530-8ed2-8cdaefd9a396
# Naive CSR measurements with less than 1000 B of traffic;
# per matrix, show only the row(s) with the largest such traffic.
let
	tiny = subset(
		df_raw,
		:traffic_B => ByRow(<(1000)),
		:impl_desc => ByRow(==("naive")),
		:A_format => ByRow(==("CSR")),
	)
	largest = combine(groupby(tiny, :A_name), :traffic_B => maximum, renamecols = false)
	innerjoin(tiny, largest, on=names(largest))
end

# ╔═╡ ec3f483e-3a52-4408-9202-b33f8b5fa116
md"""
Exclude traffic below 1 KiB $(@bind exclude_sub_1KiB CheckBox(default=true))
"""

# ╔═╡ 9ed53f3e-1c77-4698-abf4-31e64f8657a7
if exclude_sub_1KiB
	md"""
	The data frame above does not include matrices
	that incur a traffic below 1 KiB for at least one of their implementations
	(`exclude_sub_1KiB` is `true`).
	The DAMN implementations perform unexpectedly well for these matrices,
	which would skew all plots showing relative performances, e.g. DAMN vs MKL.
	The following matrices are affected.
	"""
else
	md"""
	The data frame above contains all measurements,
	including the smallest ones having a traffic below 1 KiB
	(`exclude_sub_1KiB` is `false`).
	The following matrices would be affected.
	"""
end

# ╔═╡ c03d86d1-8cb3-42dc-b0bf-5f74f43c34ab
"""
    traffic_h(bytes)

Return a human-readable bin label for `bytes` of traffic.

The value is rounded up to one leading digit in KiB or MiB,
e.g. `"50 KiB"` for 42,000 as well as for 43,008 bytes.
Everything above 900 MiB is reported as `"1 GiB"`.
"""
function traffic_h(bytes)
	# (multiplier of the unit, zeros appended to the leading digit)
	scales = ((1, ""), (10, "0"), (100, "00"))
	for (unit, k) in [("KiB", 1024), ("MiB", 1024^2)], (scale, pad) in scales
		bytes <= 9 * scale * k || continue
		width = scale * k
		# Round up to the next multiple of `width`.
		bin = (bytes + width - 1) ÷ width
		return "$(bin)$(pad) $unit"
	end
	return "1 GiB"
end

# ╔═╡ 0301ffdd-acda-42eb-acbe-1e6c6a2912e1
"""
    impl_summary(r::DataFrameRow)

Build a plot label for the implementation described by row `r`,
based on its `impl_vendor`, `impl_desc` and `impl_nacc` entries.
"""
function impl_summary(r::DataFrameRow)
	vendor = r.impl_vendor
	if vendor == "MKL"
		return "MKL $(r.impl_desc)"
	elseif vendor == "Eigen"
		return "Eigen $(r.impl_desc)"
	end

	# Everything else must be one of our own implementations.
	@assert r.impl_vendor == "DAMN"
	kind = r.impl_desc
	kind == "naive" && return "naive"
	nacc = r.impl_nacc
	if kind == "simple"
		return "$nacc acc."
	elseif kind == "Eigen-alike"
		return "Eigen-alike"
	end

	@assert startswith(r.impl_desc, "xsimd")
	flavor = replace(kind, "xsimd " => "")
	return "$nacc vector-acc.\n($flavor)"
end

# ╔═╡ d3c8100f-d709-4303-873f-15afd5171b25
"""
    shortname(s)

Return the last two characters of `s`, e.g. `"64"` for `"float64"`.
Uses `last` instead of byte-index slicing, so strings shorter than two characters
are returned unchanged and multi-byte characters do not raise an error.
"""
shortname(s) = last(s, 2)

# ╔═╡ 9445b409-3575-4c61-bdd8-871a4c796bdd
"""
    slide_desc(r::DataFrameRow)

Return a brief description of the implementation in row `r`,
based on its `impl_vendor` and `impl_desc` entries.
Unlike `impl_summary`, the number of accumulators is not part of the description.
"""
function slide_desc(r::DataFrameRow)
	r.impl_vendor == "MKL" && return "MKL $(r.impl_desc)"
	r.impl_vendor == "Eigen" && return "Eigen $(r.impl_desc)"

	# Everything else must be one of our own implementations.
	@assert r.impl_vendor == "DAMN"
	r.impl_desc == "naive" && return "naive"
	r.impl_desc == "simple" && return "multiple accumulators"
	r.impl_desc == "Eigen-alike" && return "Eigen-alike"

	@assert startswith(r.impl_desc, "xsimd")
	return "explicit vectorization"
end

# ╔═╡ b6fd7219-b58d-4040-8d26-592e8577b8ef
"""
    add_metadata(df_raw, nthreads=1; exclude_sub_1KiB::Bool)

Augment the measurements in `df_raw` with metadata and return the resulting data frame.
Besides the matrix metadata joined from the global `context` table (on `A_name`),
the following columns are added:

* `work_FLOP`: `2 * A_nnz + 2 * A_nrows`
* `perf_FLOP/s`: `work_FLOP` divided by the elapsed time (global `time_estimator` column)
* `throughput_GiB/s`: `traffic_B` divided by the elapsed time, in GiB/s
* `variant`: abbreviation of scalar type information, e.g. `A64_x64_y64`
* `traffic_h`: human-readable traffic bins, e.g. `40 KiB`
* `throughput_GiB/s_max`: memory bandwidth of the respective `traffic_h` bin (from the global `df_throughput`)
* `throughput_relative`: achieved throughput divided by memory bandwidth of respective `traffic_h` bin
* `nthreads`: number of threads (default: 1)
* `impl_summary_full`: summary of implementation details usable for plot labels
* `cache_level`: smallest cache level (from the global `cache` table) whose size is not exceeded by the traffic, or `Large` if the traffic exceeds all of them
* `slide_desc`: brief implementation summary

If `exclude_sub_1KiB` is `true`, only rows with a traffic of more than 1000 B are kept.
"""
function add_metadata(df_raw, nthreads=1; exclude_sub_1KiB::Bool)
	df = leftjoin(df_raw, context, on=:A_name)
	df.work_FLOP = @. 2 * df.A_nnz + 2 * df.A_nrows
	df."perf_FLOP/s" = df.work_FLOP ./ df[!, time_estimator]
	df."throughput_GiB/s" = @. df.traffic_B / df[!, time_estimator] / 1024^3
	df.variant = map(df.A_scalar, df.X_scalar, df.Y_scalar) do A, X, Y
		#"$(shortname(A)) × $(shortname(X)) → $(shortname(Y))"
		"A$(shortname(A))_x$(shortname(X))_y$(shortname(Y))"
	end
	df.traffic_h = DataFrames.PooledArray(@. traffic_h(df.traffic_B))
	# Attach the reference bandwidth of each traffic bin as `throughput_GiB/s_max`.
	# Bins without a reference measurement yield `missing` (left join).
	df = leftjoin(
		df,
		df_throughput[!, ["traffic_h", "throughput_GiB/s"]],
		on=:traffic_h,
		renamecols=""=>"_max",
	)
	df.throughput_relative = @. df."throughput_GiB/s" / df."throughput_GiB/s_max"
	df.nthreads .= nthreads
	df.impl_summary_full = impl_summary.(eachrow(df))
	df.slide_desc = slide_desc.(eachrow(df))

	# Compute cache bins: start with "Large" and walk from the largest to the
	# smallest cache, so that the smallest cache that fits the traffic wins.
	df.cache_level .= "Large"
	for (level, size) in eachrow(sort(cache, :size, rev=true))
		df.cache_level[df.traffic_B .<= size] .= level
	end

	# There are a few tiny matrices that skew all relative metrics --
	# in our favor, but still.
	# NOTE(review): the threshold is 1000 B although the notebook text speaks of 1 KiB,
	# and rows with exactly 1000 B are dropped here while the cells listing the tiny
	# matrices use `<(1000)` -- confirm the intended boundary.
	if exclude_sub_1KiB
		subset!(df, :traffic_B => ByRow(>(1000)))
	end
	df
end

# ╔═╡ 8dc6fdb4-e69a-4b56-8f97-1abe9c39e24a
df = add_metadata(df_raw; exclude_sub_1KiB)

# ╔═╡ 504da979-4eb3-4440-a10e-b1dcaf8f0297
levels(df.variant)

# ╔═╡ a06b7b6e-1a47-491e-b50c-72cf2545a2c6
A_scalar_bits = 64

# ╔═╡ c79796ed-bb85-44c1-a28e-e0b6b9a51d18
variant = "A$(A_scalar_bits)_x64_y64"

# ╔═╡ 3f91f320-97df-4af9-a5ba-9a4ef9c0b5d3
md"""
# Efficient CSR Implementation

Before comparing any measurements against DA-CSR,
we have to establish a good/fair CSR baseline.
Let's investigate CSR performance using 32-bit indices as well as 64-bit scalars throughout (originating from 'bench/spmv_$variant').
"""

# ╔═╡ cae1ff23-5c6d-4eb4-a222-86ae32d629e2
"""
    select_damn_csr(df, iindex...)

Select the DAMN CSR measurements of the globally chosen `variant` that use 32-bit
outer indices and one of the given inner index types `iindex`.
"""
function select_damn_csr(df, iindex...)
	criteria = (
		:A_format => ByRow(==("CSR")),
		:A_oindex => ByRow(==("int32")),
		:A_iindex => ByRow(in(iindex)),
		:variant => ByRow(==(variant)),
		:impl_vendor => ByRow(==("DAMN")),
	)
	return subset(df, criteria...)
end

# ╔═╡ b504291f-f784-415a-bbcc-55687817be11
df_csr32 = select_damn_csr(df, "int32");

# ╔═╡ ea2de365-df5e-41fa-800f-3e54e762780a
df_csr_mkl = subset(
	df,
	:variant => ByRow(==(variant)),
	:impl_vendor => ByRow(==("MKL")),
);

# ╔═╡ 869f9809-176d-497d-b2a9-aa60b6a099d4
md"""
Without anticipating too much,
here is an overview of *all* implementations included in the data set.
In the following sections we will explain them in more detail.
"""

# ╔═╡ 715f116b-3e3d-49f6-bac5-4fc0de19976c
impls = unique(df[!, Cols(:variant, :impl_desc, :impl_nacc, :impl_vendor)])

# ╔═╡ 4126074c-2627-46b7-b69c-4153341f571d
mkl_version = first(impls[impls.impl_vendor .== "MKL", :impl_desc])

# ╔═╡ 1a137de6-c1b5-4a9e-bfa3-4fde39520198
md"""
## Single-Threaded SpMV
### Naive Implementation
"""

# ╔═╡ f95c5d6e-ef6a-4437-8d88-01986eabc8fd
impl_naive = subset(
	impls,
	:impl_desc => ByRow(==("naive")),
	:variant => ByRow(==(variant)),
)

# ╔═╡ 39d8f5b5-84e8-410f-923f-44d72318da89
md"""
That plot is quite hard to digest.
How good is this compared to Intel's MKL ($mkl_version)?
"""

# ╔═╡ 9cad5ef7-e0e9-4a6f-8bd0-1b6717ce52cb
md"Pretty bad ... so let's see how far we can improve this."

# ╔═╡ facbb67c-de04-4cc8-9d4f-fedf3d4ba079
# Best-effort summary of the matrices with less than 1000 B of traffic.
# The cell must not fail (e.g. if a column such as `A_bandwidth` is unavailable),
# but the reason is logged instead of being swallowed silently.
try
	df_tiny = subset(df, :traffic_B => ByRow(<(1000)))
	combine(
		groupby(df_tiny, :A_name),
		:A_bandwidth => first => :A_bandwidth,
		:A_nnz => first => :A_nnz,
		:A_nrows => first => :A_nrows,
		:traffic_B => maximum,
	)
catch err
	@warn "Could not summarize matrices with traffic below 1000 B" exception = (err, catch_backtrace())
	nothing
end

# ╔═╡ 7efd75e6-2c6a-4a6a-bdfb-7f63fcf9c729
"""
    normalize(df_candidate, df_baseline)

Return `df_candidate` with an additional column `perf_relative`:
the candidate's `perf_FLOP/s` divided by the baseline's `perf_FLOP/s`
for the same matrix (`A_name`) and `variant`.
Every matrix of the candidate must also be present in the baseline.
"""
function normalize(df_candidate, df_baseline)
	@assert levels(df_baseline.A_name) ⊇ levels(df_candidate.A_name)
	join_cols = ["A_name", "variant"]
	baseline = df_baseline[!, vcat(join_cols, "perf_FLOP/s")]
	joined = leftjoin(df_candidate, baseline, on=join_cols, renamecols=""=>"_baseline")
	joined[!, "perf_relative"] = joined."perf_FLOP/s" ./ joined."perf_FLOP/s_baseline"
	# The baseline column was only needed to compute the ratio.
	return select(joined, Not("perf_FLOP/s_baseline"))
end

# ╔═╡ 348689a4-bfc8-46d0-ae83-7e06b13d8902
"""
    select_impl(df, impl)

Keep only the measurements in `df` that match a row of the implementation table `impl`
(inner join on all columns of `impl`).
"""
function select_impl(df, impl)
	impl_cols = names(impl)
	return innerjoin(df, impl, on=impl_cols)
end

# ╔═╡ 183d6803-1f79-4206-ba88-afc314b8eb42
df_csr32_naive = let
	df = select_impl(df_csr32, impl_naive)
	df.slide_desc .= "naive"
	df
end

# ╔═╡ dd21e13d-3440-43ca-8e3e-cd015178d3fb
md"""
### Multiple Accumulators

Modern CPUs exhibit [instruction pipelining](https://en.wikipedia.org/wiki/Instruction_pipelining).
To leverage this, subsequent instructions need to be sufficiently independent from one another.
In the context of the SpMV product, this can be achieved by introducing multiple accumulator variables inside every row's inner product.

Example: two accumulators for a single row `row`.
In C:

```c
scalar_t acc1 = 0, acc2 = 0;
oindex_t i = oindex[row], i_end = oindex[row+1];
// Process leading elements, as many as accumulators at a time:
for (; i + 1 < i_end; i += 2) {
  iindex_t col1 = iindex[i], col2 = iindex[i+1];
  scalar_t val1 = values[i], val2 = values[i+1];
  acc1 += val1 * x[col1];
  acc2 += val2 * x[col2];
}
// Process remaining elements, if any:
if (i < i_end) {
  acc1 += values[i] * x[iindex[i]];
}
y[row] = alpha * (acc1 + acc2) + beta * y[row];
```

Or, equivalently, in "math" if you are more fluent in that language:

```math
y_{\text{row}} =
\alpha \cdot \left( \sum_{i=0,2,4,...} + \sum_{i=1,3,5,...}\right)
(a_{\text{row},\text{col}_i} \cdot x_{\text{col}_i}) +
\beta \cdot y_{\text{row}}
```
"""

# ╔═╡ 293885af-8657-4090-b087-c1b98dc6fba4
df_csr32_acc = subset(
	df_csr32,
	:impl_desc => ByRow(==("simple")),
	#:impl_nacc => ByRow(<=(4)),
	:impl_nacc => ByRow(in([1,2,3,4,6,8])),
)

# ╔═╡ 3e50bb1a-83f1-47db-bf89-dd2c0aadba48
md"""
How do these implementations perform versus one another?
"""

# ╔═╡ 2637e32a-0c6e-4a92-aef9-297e41016692
md"""
Diagonal matrices perform poorly when using multiple accumulators.
For all the other matrices,
the scatter plots seem to suggest that the 3-accumulator implementation performs better than all the others more often than not.
Comparing the 3- versus 4-accumulator implementations shows a slight favor for the 3-accumulator one.
This is supported when taking the fastest implementation per matrix
and counting the implementations:
"""

# ╔═╡ 51189ea0-e454-4709-aebb-5203c22d98b3
df_csr32_acc_and_naive = let
	df = copy(df_csr32_acc)
	df.slide_desc .= "multiple accumulators"
	append!(df, df_csr32_naive)
	df.impl_summary = df.impl_summary_full
	df
end;

# ╔═╡ 669b9ce3-ab39-4999-b976-76e3f6a729af
md"""
We could naively select the implementation that has been the fastest one for the most matrices, i.e. the first row of the previous data frame.
However, a less frequent implementation could lead to a better performance than a more frequent one.
Thus, eliminate the least frequent implementations and take the next-best implementation for the affected matrices, then recurse.
This procedure yields the following implementation.
"""

# ╔═╡ cc538aeb-448d-43c7-93a2-724876695adf
md"""
How big is the improvement over the naive implementation?
"""

# ╔═╡ b37bf6e3-d412-43fc-9058-99710ff4d014
"""
    overview_labels(df_candidate, df_baseline)

Average the performance of `df_candidate` relative to `df_baseline` per cache level.
The result has the columns `cache_level`, `perf_relative_mean` and `label`,
the latter being the signed uplift in percent, and is sorted by `cache_level`.
"""
function overview_labels(df_candidate, df_baseline)
	relative = normalize(df_candidate, df_baseline)
	summary = combine(groupby(relative, :cache_level), :perf_relative => mean)
	summary.label = map(summary.perf_relative_mean) do p
		"$(@sprintf "%+2.1f" 100(p - 1))%"
	end
	return sort!(summary, :cache_level)
end

# ╔═╡ eaa4e5fb-f7a3-4965-aa55-4f7d63456529
"""
    find_best(df; by=time_estimator)

Find best implementation per matrix (and variant).

"Best" means the smallest value in the timing column `by`.
It defaults to the global `time_estimator`, i.e. the same column that all derived
performance metrics are based on, so that the selection stays consistent with them.
Returns the rows of `df` that attain this minimum; ties keep several rows per matrix.
"""
function find_best(df; by=time_estimator)
	gdf = groupby(df, [:A_name, :variant])
	best = combine(gdf, by => minimum => by)
	# Joining on the group keys and the minimal time selects the fastest row(s).
	innerjoin(df, best, on=names(best))
end

# ╔═╡ 35b2ffa3-76f6-405d-b9d9-9e2ff2b16453
"""
    find_best_impl(df, n; keepcount=false)

Repeatedly eliminate the worst implementation until at most `n` are left.
"Worst" in this case refers to "being the least-frequent best implementation":
in every round, the fastest implementation per matrix is determined and the
implementation that wins for the fewest matrices is removed from the candidates.

Returns one row per remaining implementation, sorted by `variant` and by descending
number of wins. The `count` column is only included if `keepcount` is `true`.
"""
function find_best_impl(df, n; keepcount=false)
	impl_cols = [:variant, :impl_desc, :impl_nacc, :impl_vendor]
	remaining = df
	local tally
	while true
		winners = groupby(find_best(remaining), impl_cols)
		tally = combine(winners, :A_name => length => :count)
		length(winners) <= n && break
		# Drop all measurements of the implementation with the fewest wins.
		loser = tally[[argmin(tally.count)], Not(:count)]
		remaining = antijoin(remaining, loser, on=names(loser))
	end
	sort!(tally, [order(:variant), order(:count, rev=true)])
	keepcount || select!(tally, Not(:count))
	return tally
end

# ╔═╡ b111fb26-adb7-4b60-9f36-06771b587f5e
impl_acc = let
	df = find_best_impl(df_csr32_acc, 1)
	append!(df, impl_naive)
end

# ╔═╡ 2a405e0c-dd41-4acc-a7e6-6d293ce82070
df_csr32_acc_best = let
	df = find_best(select_impl(df_csr32, impl_acc))
	df.slide_desc .= "naive"
	df.slide_desc[df.impl_desc .== "simple"] .= "multiple accumulators"
	df
end

# ╔═╡ e3d7093f-51eb-45cb-a75f-8fafb63823e8
df_slides_acc_vs_naive = overview_labels(df_csr32_acc_best, df_csr32_naive)

# ╔═╡ a3f9b853-4ce7-42dc-8090-5f5958124942
"""
    count_impl(df, cols...)

Count the rows of `df` per implementation, i.e. per combination of
`variant`, `impl_desc`, `impl_nacc` and any additional columns `cols`.
The result is sorted by `variant` and by descending `count`.
"""
function count_impl(df, cols...)
	group_cols = unique([:variant, :impl_desc, :impl_nacc, cols...])
	counts = combine(groupby(df, group_cols), :A_name => length => :count)
	return sort!(counts, [order(:variant), order(:count, rev=true)])
end

# ╔═╡ f653cb80-980a-4159-b84f-f59e6698b765
count_impl(find_best(df_csr32_acc_and_naive))

# ╔═╡ efcf31b4-08c2-4bb8-8b09-ed9606135061
count_impl(df_csr32_acc_best)

# ╔═╡ fd0bb4e4-2eec-48c1-b4a4-7be4aa8e7eeb
md"""
### Explicit SIMD Vectorization

Under somewhat rare circumstances, the compiler is able to vectorize certain operations (requires e.g. loads from aligned memory locations, disjoint arrays).
For the generic SpMV product, this is almost never the case.
Therefore, use [xsimd](https://xsimd.readthedocs.io/en/latest/) to help the compiler.
"""

# ╔═╡ 7da59d41-918c-4875-a1a7-43c5f07e2d0a
#df_csr32_xsimd_baseline = df_csr32_naive;
df_csr32_xsimd_baseline = df_csr32_acc_best;

# ╔═╡ 5d49aee6-7ce3-465a-baa2-18e07585cd5c
df_csr32_xsimd = subset(
	df_csr32,
	#:impl_desc => ByRow(s -> startswith(s, "xsimd") && endswith(s, "aligned")),
	:impl_desc => ByRow(startswith("xsimd")),
	#:impl_nacc => ByRow(in([1, 2])),
)

# ╔═╡ 7bd36293-02e0-4b79-8813-5552abfd8f7e
md"""
For this particular hardware, the "naive" SIMD implementations perform best.
These "unaligned" implementations do not perform any loop-peeling to allow loads from aligned memory locations, or use more than one (SIMD) accumulator.
Still, let's eventually select only the single best candidate implementation.
"""

# ╔═╡ 42236bfb-9dd7-4bb2-b0cc-018a662f3909
df_csr32_xsimd_and_baseline = let
	df1 = copy(df_csr32_xsimd_baseline)
	df2 = copy(df_csr32_xsimd, copycols=false)
	df1.impl_summary .= "baseline"
	df2.impl_summary = df2.impl_summary_full
	append!(df1, df2)
	df1
end;

# ╔═╡ 2cfb4fde-2e53-4d08-b6e3-a6b82cab5209
count_impl(find_best(df_csr32_xsimd_and_baseline))

# ╔═╡ 5057c526-f0cc-496b-bf9d-fa446bba92e2
find_best_impl(df_csr32_xsimd, 2, keepcount=true)

# ╔═╡ a57ead15-3c91-4caa-8005-cfd0f42dc154
impl_xsimd = let
	impl = find_best_impl(df_csr32_xsimd, 1)
	append!(impl, impl_acc)
end

# ╔═╡ 20db2172-169e-42bc-8e03-649a19272e2a
df_csr32_xsimd_best = find_best(select_impl(df_csr32, impl_xsimd));

# ╔═╡ c884f3d8-29ff-47fd-81f1-8995f95cc25a
count_impl(df_csr32_xsimd_best)

# ╔═╡ 7a451dfb-7c2c-403e-a48b-c99776cddd45
md"""
How does this compare to just using multiple accumulators without vectorization?
"""

# ╔═╡ e924bcfd-4c51-42d3-a9ab-c9c2100ec63d
df_slides_xsimd_vs_acc = overview_labels(df_csr32_xsimd_best, df_csr32_xsimd_baseline)

# ╔═╡ fae9bdfb-a571-449c-aef1-0a3267e24c89
md"""
How far have we come compared to the naive implementation?
"""

# ╔═╡ 645f4b95-e354-489b-a3dc-b56abd17652e
md"""
How fast is this compared to Intel's MKL? Pretty good, I would say.
"""

# ╔═╡ 45f79dd6-c337-45f1-b7ef-dd0016ca5620
n_csr32 = length(levels(df_csr32.A_name))

# ╔═╡ fb4e29fb-c4ff-4155-bfbc-6d4b5f693892
desc_16bit_indices = ",\n16-bit column indices"

# ╔═╡ 3e723e2c-0005-4a12-b1e4-b5e84aef9549
df_csr16 = let df = select_damn_csr(df, "int16")
	df.slide_desc .*= desc_16bit_indices
	df
end

# ╔═╡ 476b1301-ec73-4c24-80a8-78b4295121a3
n_csr16 = length(levels(df_csr16.A_name))

# ╔═╡ c741e5b1-7687-4ece-9b33-b63a9afb32e9
df_csr8 = select_damn_csr(df, "int8")

# ╔═╡ 4c7d151f-9d79-454c-934e-a6298c65b788
n_csr8 = length(levels(df_csr8.A_name))

# ╔═╡ 78975aec-99d1-4bde-9b09-20b660e9d9ca
md"""
### Smaller Column Indices

While the outer indices in CSR storage (sometimes called "row pointers" or `rowptr`) have a range of 0 to `nnz`,
the inner indices ("column indices" or `colids`) only extend up to the number of columns,
which is much smaller than `nnz`.
If the matrix dimension does not exceed 2¹⁵ - 1 = 32767,
these inner indices can be stored using a 16-bit integer instead of a 32-bit one.
$n_csr16 out of $n_csr32 matrices, or $(@sprintf "%2.1f" 100n_csr16/n_csr32)%, allow such a representation.
$n_csr8 matrices, or $(@sprintf "%2.1f" 100n_csr8/n_csr32)%, even allow 8-bit integers to be used for the inner indices.
"""

# ╔═╡ 1bc8b9e8-fef4-4234-bbae-c9058ae23c5b
ideal_perf_ratio = (32 + A_scalar_bits) / (16 + A_scalar_bits)

# ╔═╡ e52cea5d-5a25-4a77-835e-9709e9ac18d5
ideal_speedup = ideal_perf_ratio - 1

# ╔═╡ 3eac77a5-7a56-4907-a3e6-ba8278530745
md"""
How does the best single-threaded 16-bit-index CSR implementation compare to the previously best 32-bit-index one?
In a memory-bound regime, the theoretical performance uplift is ~$(@sprintf "%2.0f" 100ideal_speedup)%,
which is indeed achieved once the traffic exceeds the size of the L3 cache (11 MiB LLC).
Moreover, the 16-bit-index implementation most of the time performs no worse than the 32-bit-index one.
Therefore, if viable, the smaller index type should be used.
"""

# ╔═╡ ed92e232-8604-486d-bea9-0cac08b11605
md"""
Occasionally, the performance uplift of CSR(int16) over CSR(int32) is even larger than predicted.
This is likely due to the fact that the indices fit into fewer memory pages (usually 4 KiB each).
Therefore, fewer communications to the main memory are necessary overall,
each imposing a certain latency.

One matrix is even able to realize a throughput that is higher than the memory bandwidth as measured by a Triad benchmark.
However, this is hardly significant.
"""

# ╔═╡ 8af55781-b53d-41fa-8a0d-ed14f5454729
df_csr16_xsimd_best = find_best(select_impl(df_csr16, impl_xsimd))

# ╔═╡ e037f27e-70c0-473c-9339-86c00ea5e6b6
md"""
Let's summarize by using 16-bit column indices where possible and 32-bit column indices otherwise.
"""

# ╔═╡ 0ed9c245-0583-4e89-b9b1-8f90ba7d170f
# Fastest selected implementation per matrix, using the CSR(int16) measurements
# where available and the CSR(int32) measurements for all other matrices.
df_csr48_best = let
	only32 = antijoin(df_csr32, df_csr16, on=:A_name)
	merged = append!(only32, df_csr16)
	find_best(select_impl(merged, impl_xsimd))
end;

# ╔═╡ d95249c8-39ab-4240-a3ee-503832ef7597
# Measurements whose throughput exceeds the reference memory bandwidth of their bin.
df_too_quick = let
	fast = subset(df_csr48_best, :throughput_relative => ByRow(>(1)))
	fast.traffic_MiB = fast.traffic_B ./ 1024^2
	select!(fast, Cols([:A_name, :A_iindex, :traffic_MiB, :throughput_relative]))
end

# ╔═╡ 6e113127-a2fa-4a63-b42b-c77887af3202
df_slides_csr16_vs_csr32 = let
	df = overview_labels(df_csr16_xsimd_best, df_csr32_xsimd_best)
	df.label .*= " ⁽*⁾"
	df.label[df.cache_level .== "Large"] .*= "\n\n(* if applicable)"
	df
end

# ╔═╡ 9e388fa0-ac35-4914-9838-77faa366f222
md"""
### Summary

What does the CSR(int16/int32) performance look like in absolute numbers?
"""

# ╔═╡ ed4b6a1e-99cf-4544-aa3c-57d8afa6710b
md"""
How does our CSR(int16/int32) implementation compare against Intel's MKL?
"""

# ╔═╡ 797250b7-f192-4dca-9e8a-288828ce259f
md"""
A considerable amount of the performance uplift over MKL is due to the smaller index type.
How does the best single-threaded CSR(int16) implementation compare to the MKL?
Note that the MKL only supports matching types for inner and outer indices,
as well as only 32-bit or 64-bit integers.
"""

# ╔═╡ 5555bff6-bf59-4bc0-b02b-48b41e66c212
md"""
The consistent uplift for off-chip traffic (right-most region) looks very promising.
As we will see in a bit, this is about the same as for DA-CSR,
with the benefit of DA-CSR being applicable to a much wider range of matrices.
"""

# ╔═╡ c4897041-d92e-4f4c-8542-6a92b0781dc9
csr_vs_mkl_perf_limits = (0.7, 1.8)

# ╔═╡ f7645262-e9dc-44d5-b297-a308e57f04a8
csr_vs_mkl_thrp_limits = (0.6, 1.7)

# ╔═╡ 832379fd-a286-48d7-a220-c8fd7f4a9c21
md"""
## Multi-Threaded SpMV

At this point, we may as well throw hardware at the problem.
Let's investigate how multi-threading affects SpMV performance.
"""

# ╔═╡ 9e38efae-6db8-4afc-af1d-be33c9de8096
# Raw multi-threaded measurements, keyed by the number of threads.
# Thread counts without a corresponding `omp<n>` directory below `path` are skipped.
omp_data = let
	d = Dict{Int,DataFrame}()
	for i in 2:2:8
		f = joinpath(path, "omp$i")
		isdir(f) || continue
		d[i] = read_spmv_data(f)
	end
	d
end;

# ╔═╡ 2c971eec-30c3-4f06-b22b-58961a792f61
# Single-threaded measurements plus the augmented measurements of every
# multi-threaded data set, each tagged with its number of threads.
df_omp = let combined = copy(df)
	for (nthreads, raw) in omp_data
		append!(combined, add_metadata(raw, nthreads; exclude_sub_1KiB))
	end
	combined
end;

# ╔═╡ 6da0a70d-6415-429f-8798-e52f184eba5d
df_csr16_omp = let
	df = select_damn_csr(df_omp, "int16")
	df = select_impl(df, impl_xsimd)
end;

# ╔═╡ 6aab4d99-140a-402c-8350-8e7a94993315
df_csr32_omp = let
	df = select_damn_csr(df_omp, "int32")
	df = select_impl(df, impl_xsimd)
end;

# ╔═╡ b5584593-7277-4eca-9205-5e0a16b257e0
df_csr48_omp = let
	df = copy(df_csr16_omp)
	df.slide_desc .*= desc_16bit_indices
	df32 = antijoin(df_csr32_omp, df, on=:A_name)
	append!(df, df32)
end;

# ╔═╡ 666c70c1-4aa4-4288-ab9b-014efda8c501
df_csr32_xsimd_best_omp = combine(groupby(df_csr32_omp, :nthreads), find_best);

# ╔═╡ 20d567a8-43d8-4916-bd1d-db1bc849a706
df_csr48_xsimd_best_omp = combine(groupby(df_csr48_omp, :nthreads), find_best);

# ╔═╡ de5d79c5-98c0-4ffd-a331-aa1a82b2d3b0
df_omp_csr_mkl = subset(
	df_omp,
	:variant => ByRow(==(variant)),
	:impl_vendor => ByRow(==("MKL")),
);

# ╔═╡ 4963b6f9-9ad4-42a7-8d72-e8bedbde68eb
md"""
With traffic beyond the size of the CPU's L1 cache,
more threads nearly unconditionally result in better performance.
"""

# ╔═╡ f54e1e06-f62c-4787-8783-7a22624f1c04
md"""
When selecting the fastest implementation per matrix,
there is only a small transition phase where fewer than the maximum number of threads is optimal.
This region extends roughly from 32 KiB (size of L1) to 100 KiB,
which is way below 1024 KiB (size of L2).
"""

# ╔═╡ 80ba86a3-dddd-4fbb-bad8-046fdafae7f4
df_csr48_omp_best = find_best(df_csr48_omp);

# ╔═╡ 52cad7b5-866e-4098-81c2-d6ea22b308e1
count_impl(df_csr48_omp_best)

# ╔═╡ 706e076b-fb66-4cd3-972d-b6a6f907528e
df_csr32_omp_best = find_best(df_csr32_omp);

# ╔═╡ 56cebf10-cd38-4db2-940c-ee50dd18d84c
df_csr16_omp_best = find_best(df_csr16_omp);

# ╔═╡ 2a700192-842b-4d11-80d6-0616bd756964
# Combined cache size in MiB: the L3 cache plus one L2 cache per thread.
total_cache = let
	l2 = only(cache.size[cache.level .== "L2"])
	l3 = only(cache.size[cache.level .== "L3"])
	# Without any multi-threaded data set `omp_data` is empty; fall back to a single
	# thread instead of failing on the maximum of an empty collection.
	maxthreads = maximum(keys(omp_data); init=1)
	total = maxthreads * l2 + l3
	total / 1024^2
end

# ╔═╡ 2c9b2424-2d31-46f7-80dc-e47b1d758501
md"""
#### CSR(int16) vs CSR(int32)

Even in a multi-threaded setting,
the CSR implementation using 16-bit column indices, CSR(int16),
performs on par with CSR(int32) while the traffic remains on-chip.
Only for larger problems with traffic within L3 we observe the expected $(@sprintf "%+2.0f" 100ideal_speedup)% speed-up.
However, due to multi-threading the total cache size increases to $total_cache MiB,
the size of L3 plus number of threads times L2.
Thus, many matrices fit into this combined cache,
which causes an even larger speed-up.
Beyond $total_cache MiB of traffic the speed-up is again close to but below $(@sprintf "%+2.0f" 100ideal_speedup)%.
"""

# ╔═╡ b09bb2c2-5a27-4706-a789-036bfb62fbc5
md"""
#### CSR(int32) vs MKL

The scaling behavior looks almost identical for Intel's MKL.
Overall, the comparison of the best DAMN CSR implementation to the MKL looks quite good, again.
"""

# ╔═╡ 60a52995-fd4f-4991-b5d0-fb9b13b58204
df_omp_csr_mkl_best = find_best(df_omp_csr_mkl);

# ╔═╡ a32c52e4-fe54-4cdc-a8fb-8bba71e99e84
md"#### CSR(int16/int32) vs MKL"

# ╔═╡ 468ac04f-1e94-49eb-aea9-6d5c46511971
md"""
# Comparison With DA-CSR

Let's compare the best CSR implementations with similar DA-CSR ones.
Some matrices did not achieve the promised bandwidth after applying a Reverse Cuthill-McKee (RCM) permutation.
"""

# ╔═╡ 16221009-0aff-493d-8d4a-5d3dce800399
md"""
Indeed, the resulting bandwidth exceeds $(typemax(Int16)).
"""

# ╔═╡ 1ebfa3d0-c873-429e-9371-36dc2b4ded6f
md"""
```
$ list_matrices_nnz32 > /dev/null
level=info message="1367 matrices fit into CSR(int32,int32)"
level=info message="993 matrices (72.6%) fit into CSR(int32,int16)"
level=info message="1302 matrices (95.2%) fit into DA-CSR(int32,int16)"
```
"""

# ╔═╡ 69c0e5a1-7d93-41b3-9414-e56bfe1e4d0f
md"""
## Single-Threaded SpMV

Exclude the Eigen-alike implementation.
It is just another 2-accumulator implementation,
but for some reason, it behaves better than it should.
"""

# ╔═╡ 0a250130-3c08-4f28-89e9-2921327af919
df_dacsr = subset(
	df,
	:A_format => ByRow(==("DA-CSR")),
	:A_oindex => ByRow(==("int32")),
	:A_iindex => ByRow(==("int16")),
	:variant => ByRow(==(variant)),
);

# ╔═╡ cb01957c-8743-495c-8dcd-4cd25520a5ba
n_dacsr = length(levels(df_dacsr.A_name))

# ╔═╡ 40b4af69-d7ff-4098-b0ec-2560561a629f
md"""
Consider, again, the following implementations:

* naive
* single best using multiple scalar accumulators
* single best using explicit vectorization
"""

# ╔═╡ fdc0a2ad-56ec-4a24-9bb7-0d3643985146
# Implementations considered for DA-CSR: the naive one, the single best one using
# multiple scalar accumulators, and the single best one using explicit vectorization.
impl_dacsr = let
	best_of(matches) = find_best_impl(subset(df_dacsr, :impl_desc => ByRow(matches)), 1)
	selection = copy(impl_naive)
	append!(selection, best_of(==("simple")))
	append!(selection, best_of(startswith("xsimd")))
	selection
end

# ╔═╡ 46d43c34-fa41-49fe-94b1-0f0055cac140
df_dacsr_best = find_best(select_impl(df_dacsr, impl_dacsr));

# ╔═╡ 1682dc34-08e4-40f5-a8c3-ab19c9106ddc
combine(
	groupby(antijoin(df_csr32, df_dacsr_best, on=:A_name), :A_name),
	:A_bandwidth => first => :A_bandwidth,
)

# ╔═╡ 7ebed273-5400-4031-aea6-d95ca3b87ef2
begin
	same_impls_as_csr = isempty(antijoin(impl_xsimd, impl_dacsr, on=names(impl_dacsr)))
	if same_impls_as_csr
		md"""
		While we obtain the same selection of implementations for DA-CSR as for CSR,
		the distribution of which implementation performs best looks rather different.
		"""
	else
		md"""
		Note that we obtain a different set of optimal implementations.
		"""
	end
end

# ╔═╡ 21093972-89fd-466f-93e0-9a0e7a3d555f
count_impl(df_dacsr_best)

# ╔═╡ 4982c0d0-09bd-4791-a5ac-ace5a7b33994
count_impl(df_csr32_xsimd_best)

# ╔═╡ 93b4e1ad-ed80-4d77-8176-29aa29b79d56
md"""
Let's compare these single-threaded measurements with the baseline implementation.
As we will see, single-threaded DA-CSR performs on par with CSR(int16)
but is not able to obtain the anticipated performance uplift compared to CSR(int32).
"""

# ╔═╡ 1e651f68-9011-450b-a44c-5d4e37c5cfeb
md"""
### DA-CSR vs CSR(int16)

Consider only the matrices that have a CSR(int16) representation.
"""

# ╔═╡ 50b0d49e-e3cd-40bb-bf34-936eaddbb052
small_matrices = unique!(select(df_csr16, :A_name));

# ╔═╡ eaf78014-0fe4-4e57-8a4c-a7546fc184eb
md"""
### DA-CSR vs CSR(int32)
"""

# ╔═╡ 4141c689-741b-48c1-9719-30affea3026d
md"""
### DA-CSR vs CSR(int16/int32)
"""

# ╔═╡ 677e4957-834b-4991-bc76-0f4a2a953ebc
md"""
### DA-CSR vs MKL
"""

# ╔═╡ 4e3d793e-921c-4674-b125-58184f0fc6be
# Limits of the relative-performance axis in the DA-CSR vs CSR figures
# (passed as `perf_limits` to `draw_comparison`).
# Earlier setting, marked "single-threaded": (0.85, 1.25)
dacsr_vs_csr_perf_limits = (0.85, 1.45)

# ╔═╡ 65488b5d-effb-4d5a-90cd-a845ba920ee1
# Limits of the relative-throughput axis in the DA-CSR vs CSR figures
# (passed as `throughput_limits` to `draw_comparison`).
# Earlier setting, marked "single-threaded": (0.75, 1.1)
dacsr_vs_csr_thrp_limits = (0.725, 1.2)

# ╔═╡ f196cc7a-1e18-4968-8bf7-b62aa34d2ad2
md"""
## Multi-Threaded SpMV

Again, exclude the "Eigen-alike" implementation of DA-CSR.
Its automatic switching between non-OpenMP and OpenMP may skew the results.
"""

# ╔═╡ eae290ac-e2ca-470f-ab80-d2cc31debbf7
df_dacsr_omp_best = let
	# Row-wise equality filter for a single column.
	is(value) = ByRow(==(value))
	# DA-CSR rows of `df_omp` with int32 outer and int16 inner indices
	# for the currently selected `variant`.
	candidates = subset(
		df_omp,
		:A_format => is("DA-CSR"),
		:A_oindex => is("int32"),
		:A_iindex => is("int16"),
		:variant => is(variant),
	)
	find_best(select_impl(candidates, impl_dacsr))
end;

# ╔═╡ 3b60e4f3-fbc8-4f62-84f1-667fd777371e
md"""
### DA-CSR vs CSR(int16)

Consider only the matrices that have a CSR(int16) representation.
DA-CSR performs on par with CSR(int16) throughout:
"""

# ╔═╡ 82c7bc2d-d577-4234-8be4-269346cd0f81
md"""
### DA-CSR vs CSR(int32)

Consider all matrices again.
In a multi-threaded setting,
DA-CSR obtains a speed-up close to the expected $(@sprintf "%+2.0f" 100ideal_speedup)%
once the traffic exceeds the total cache size, $total_cache MiB.
However, as the relative throughput is less than 1,
there remains some unrealised potential.
For larger traffic up to $total_cache MiB the speed-up is even larger than anticipated.
This is likely due to the fact that only half as many cache lines are necessary to load all the column indices.
"""

# ╔═╡ 3d7de276-05a5-49ee-b0fe-0b928a0aa57d
md"""
### DA-CSR vs CSR(int16/int32)
"""

# ╔═╡ 6f2d2c92-b279-4a00-bf63-78a5aa1ce35e
md"""
### DA-CSR vs MKL
"""

# ╔═╡ 799319b5-fe95-42d2-b44c-107c0f18c790
# The DA-CSR vs MKL figures reuse the axis limits of the CSR vs MKL figures.
dacsr_vs_mkl_perf_limits = csr_vs_mkl_perf_limits

# ╔═╡ cd75fb8c-9040-49db-83dc-e7c8ab5a7103
# Same for the relative-throughput axis.
dacsr_vs_mkl_thrp_limits = csr_vs_mkl_thrp_limits

# ╔═╡ 2731d186-8248-43e5-8b6a-a49316d7517e
md"""
## Example: Janna/Bump_2911

This is one of the largest matrices in the test set.
Let's see how it performed.
"""

# ╔═╡ 1f1795ef-5f25-4be6-a7a9-de71c98dc1f0
df_bump = let
	is_bump = :A_name => ByRow(==("Janna/Bump_2911"))
	others = (df_csr48_best, df_dacsr_omp_best, df_csr48_omp_best)
	# Start with the row(s) from `df_dacsr_best`, then stack the matching rows
	# of the other tables; `cols=:intersect` ignores columns the first table lacks.
	rows = subset(df_dacsr_best, is_bump)
	foreach(others) do other
		append!(rows, subset(other, is_bump), cols=:intersect)
	end
	rows.traffic_GiB = rows.traffic_B ./ 1024^3
	rows
end

# ╔═╡ 487c0837-af13-4483-8858-da4542187e86
# Evaluates to `true` if the first row of `df_bump` (taken from `df_dacsr_best`)
# carries the largest traffic found in `df_dacsr_best`.
maximum(df_dacsr_best.traffic_B) == first(df_bump.traffic_B)

# ╔═╡ 1a255baf-a61b-44fb-a2d3-9b73e944a5da
md"""
Going from CSR(32, 32) to DA-CSR(32, 16) reduces the SpMV traffic by ~16%
while also increasing the performance by ~12% to ~15%,
depending on the number of threads being used.
The throughput drops slightly to about 94% to 97%,
which means that the aforementioned performance gains could still be improved slightly,
but are overall very close to the theoretical maximum.
"""

# ╔═╡ 1a2415a2-ef35-4024-82a6-c158caeb69b5
let
	# Both helpers splat the rows of a group into a binary operator,
	# so every `nthreads` group must hold exactly two rows.
	ratio = Base.splat(/)     # first row over second row
	inv_ratio = Base.splat(\) # second row over first row
	combine(
		groupby(df_bump, :nthreads),
		:traffic_B => ratio => :traffic_ratio,
		:traffic_B => inv_ratio => :inv_traffic_ratio,
		time_estimator => inv_ratio => :perf_ratio,
		"throughput_GiB/s" => ratio => :throughput_ratio,
	)
end

# ╔═╡ 19366ee0-172e-48be-8108-20466d202d62
md"""
## Example: GHS_psdef/ldoor

This is a matrix that requires a bandwidth reduction (RCM in our case) before the DA-CSR format can be applied.
"""

# ╔═╡ 2fc7d727-bc7d-484d-ae84-38a6a6d39994
df_ldoor = let
	# Rows of `table` whose matrix name ends with "ldoor".
	pick(table) = subset(table, :A_name => ByRow(endswith("ldoor")))
	rows = pick(df_dacsr_best)
	# `cols=:intersect` ignores columns of the other tables that `rows` lacks.
	for table in (df_csr48_best, df_dacsr_omp_best, df_csr48_omp_best)
		append!(rows, pick(table), cols=:intersect)
	end
	rows.traffic_MiB = rows.traffic_B ./ 1024^2
	rows
end

# ╔═╡ 15d2e0b1-e3f1-4e1f-8a65-049d90d441b5
md"""
Going from CSR(32, 32) to DA-CSR(32, 16) reduces the SpMV traffic again by ~16%
but this time the performance increases slightly more by ~15% to ~17%.
The throughput drops to 96% to 98%,
which means that the aforementioned performance gains are very close to the theoretical optimum.
"""

# ╔═╡ 9344a0bb-773a-4ecf-924c-05a03f5393bc
let
	# Both helpers splat the rows of a group into a binary operator,
	# so every `nthreads` group must hold exactly two rows.
	ratio = Base.splat(/)     # first row over second row
	inv_ratio = Base.splat(\) # second row over first row
	combine(
		groupby(df_ldoor, :nthreads),
		:traffic_B => ratio => :traffic_ratio,
		:traffic_B => inv_ratio => :inv_traffic_ratio,
		time_estimator => inv_ratio => :perf_ratio,
		"throughput_GiB/s" => ratio => :throughput_ratio,
	)
end

# ╔═╡ 0aae5ee1-3b16-4998-87b9-17a709e3e3b8
let
	# Rows with a relative performance above 1.18 and more than 600 MiB of
	# traffic, after normalizing `df_dacsr_omp_best` by `df_csr48_omp_best`.
	relative = normalize(df_dacsr_omp_best, df_csr48_omp_best)
	subset(
		relative,
		:perf_relative => ByRow(>(1.18)),
		:traffic_B => ByRow(>(600*1024^2)),
	)
end

# ╔═╡ 509af447-81f1-417c-91cc-38eb3ba53c34
# Name of the matrix inspected in the generic example cells
# (section title, `df_example`).
example = "Oberwolfach/bone010"
# Alternative choice: "Janna/Flan_1565"

# ╔═╡ 57ef4e3d-12fe-4367-9d8c-9d32fd430e4f
md"""
## Example: $example
"""

# ╔═╡ d6e98f18-7e3e-437b-8050-fb432a7038c1
df_example = let
	# Rows of `table` that belong to the matrix named by `example`.
	pick(table) = subset(table, :A_name => ByRow(==(example)))
	rows = pick(df_dacsr_best)
	# `cols=:intersect` ignores columns of the other tables that `rows` lacks.
	foreach((df_csr48_best, df_dacsr_omp_best, df_csr48_omp_best)) do table
		append!(rows, pick(table), cols=:intersect)
	end
	rows.traffic_MiB = rows.traffic_B ./ 1024^2
	rows.traffic_GiB = rows.traffic_B ./ 1024^3
	rows
end

# ╔═╡ b3844c5b-fec5-4cac-84b0-7e39d92779e0
let
	# Both helpers splat the rows of a group into a binary operator,
	# so every `nthreads` group must hold exactly two rows.
	ratio = Base.splat(/)     # first row over second row
	inv_ratio = Base.splat(\) # second row over first row
	combine(
		groupby(df_example, :nthreads),
		:traffic_B => ratio => :traffic_ratio,
		:traffic_B => inv_ratio => :inv_traffic_ratio,
		time_estimator => inv_ratio => :perf_ratio,
		"throughput_GiB/s" => ratio => :throughput_ratio,
	)
end

# ╔═╡ 32a331e9-09bc-4384-ac60-de4f256c3ca3
md"""
# Summary and Outlook

* Use the smallest integer type possible;
  using CSR(int32, int16) over CSR(int32, int32) yields
  * Single-threaded, L2 <= traffic <= L3: ca +5% performance
  * Single- or multi-threaded, off-chip traffic: ca +17% performance
* DA-CSR applicable to many matrices:
  $n_dacsr of 1367 matrices, or $(@sprintf "%2.1f" 100n_dacsr/1367)%
  of real square matrices from the SuiteSparse Matrix Collection
  having full structural rank that fit into CSR(int32, int32)
* Performance of DA-CSR(int32, int16)
  * On-par with CSR(int32, int16)
  * Off-chip traffic: ca +17% faster than CSR(int32, int32), including MKL

Recall that DA-storage is a technique that can be applied to other data types as well:

* DA-CSR₄ (has separate row start and end pointers), DA-CSR₅ ([Liu, Vinter 2015](https://arxiv.org/abs/1503.05032)), and other variants of CSR that change the data layout
* DA-SELL-$C$-$\sigma$ ([Kreutzer et al. 2014](https://blogs.fau.de/essex/files/2012/11/SELL-C-sigma.pdf))
* DA-CSR₃ as the leaf type within RSB ([Martone et al. 2010](https://librsb.sourceforge.net/))

Future Work:
Create smart/automatic entry-point for SpMV computations:

* Move book-keeping overhead from row- to matrix-level
  by using `nnz/nrows` instead of `nnz[row]` --
  this should help the CPU's branch prediction
* Train decision tree to select fastest implementation
* Use that to hard-code dispatch
* Build a DAMN library for CSR and DA-CSR SpMV

Furthermore:

* Benchmark SpMV on AMD -- aligned `xsimd` loads seem to be much more important there
* SpMM -- needed for e.g. Alternating-Directions Implicit (ADI) method
"""

# ╔═╡ f16be015-274d-45ab-a4f8-940b2bd92257
md"# Appendix"

# ╔═╡ 3c20500a-6f84-4365-ad29-89bc28bbb902
md"""
## Plotting Packages

Makie's `FileIO.save()` does not work with an `IOBuffer`,
so we have to save to an actual file instead.
Dump these files into `tmp`.
Figures intended for presentation/publication will be stored at `fig_path`.
"""

# ╔═╡ cfe37852-631e-46bc-b66b-e498b07bb87c
# Fresh temporary directory for files that are only written so that they can
# be read back afterwards (e.g. for a download button).
tmp = mktempdir()

# ╔═╡ 8d962291-1e09-4537-a0d9-f501f68204f7
fig_path = let
	dir = joinpath(path, "figures")
	mkpath(dir) # create the directory, including parents, if it is missing
	dir
end

# ╔═╡ e38339a2-67dc-4dc7-8526-4246be1961a5
# Axis attributes shared by all traffic plots of this notebook.
axis = (;
	# logarithmic traffic axis with byte-unit ticks
	xscale = log10,
	xticks = BytesTicks(),
	xminorticks = BytesTicks(),
	# show minor ticks and minor grid lines in both directions
	xminorticksvisible = true,
	xminorgridvisible = true,
	yminorticksvisible = true,
	yminorgridvisible = true,
	# to match width of draw_overview_omp()
	yticklabelspace = 20.0,
);

# ╔═╡ f7fc74d0-e9da-4bed-a761-1a8fc903d73c
# Colorbar settings shared by all figures. The color values are passed through
# `log10` in the color mappings, so tick positions `v` are mapped back with
# `10^v` for the labels. Rounding keeps this from throwing an `InexactError`
# when a tick is not an exact integer power of ten (e.g. v = 0.5).
colorbar = (
	tickformat = vs -> ["$(round(Int, 10^v))" for v in vs],
);

# ╔═╡ 72718d96-5560-42d9-b481-3ee7f5d6e000
# Column => axis label pairs reused by the mappings below.
axis_traffic = "traffic_B" => "Traffic";

# ╔═╡ c5394a1b-71c9-4036-a6c6-9fbfc6e19919
# Absolute throughput.
axis_throughput = "throughput_GiB/s" => "Throughput [GiB/s]";

# ╔═╡ f7f8d346-48d4-4fa1-a8e5-906dfd287ad8
# Throughput relative to some reference (column `throughput_relative`).
axis_throughput_rel = "throughput_relative" => "Relative Throughput";

# ╔═╡ e7252665-e6b9-4b94-bd32-e457fba186a2
# Absolute performance; the column is divided by 1000^3 for display in GFLOP/s.
axis_perf_abs = "perf_FLOP/s" => (x -> x / 1000^3) => "Performance [GFLOP/s]";

# ╔═╡ 8e090ee9-6c76-4d4c-a824-7daf13a297c9
# Performance relative to some reference (column `perf_relative`).
axis_perf_rel = "perf_relative" => "Relative Performance";

# ╔═╡ a77826bb-cb81-480f-87a0-0fda0c7574e8
# x = traffic, y = relative performance.
map_relperf_vs_traffic = mapping(axis_traffic, axis_perf_rel);

# ╔═╡ ec22cfed-8886-46e8-83ce-07e45e90855a
# Color by the log10 of the `A_nnz/nrows` column.
color_fatness = mapping(color="A_nnz/nrows" => log10 => "Average nnz per row");

# ╔═╡ 080c4c28-3b62-492d-a2b5-461ec3866515
"""
    versusplot(df, groupcols)

Compare the performance of different implementations akin to a pair plot.

`df` is grouped by `groupcols`. For every group, the whole of `df` is passed to
`normalize` with that group as second argument, and the result is tagged with a
`vs` column reading `"vs. <group key>"`. The stacked results are returned as an
AlgebraOfGraphics specification with `groupcols` as columns and `vs` as rows,
showing `perf_relative` over `traffic_B`, colored by `color_fatness`.
"""
function versusplot(df, groupcols::Union{String,Symbol})
	groups = groupby(df, groupcols, sort=true)

	# One normalized copy of `df` per reference group, stacked on top of each other
	stacked = DataFrame()
	for key in keys(groups)
		relative = normalize(df, groups[key])
		relative.vs .= "vs. $(only(key))"
		append!(stacked, relative, cols=:union)
	end

	grid = mapping(
		:traffic_B => "Traffic",
		:perf_relative => "Relative Performance",
		col=groupcols => nonnumeric,
		row=:vs,
	)
	return data(stacked) * grid * color_fatness
end

# ╔═╡ e61dab3e-18ca-4cb3-8302-0df674db512a
# Color by the `nthreads` column, treated as categorical via `nonnumeric`.
color_threads = mapping(color=:nthreads => nonnumeric => "#Threads");

# ╔═╡ baf6d4f6-192b-4a23-8075-828bbc447254
# Absolute performance and absolute throughput over traffic,
# one plot row per y-quantity.
overview_abs_abs = mapping(
	axis_traffic,
	Any[axis_perf_abs, axis_throughput],
	row=dims(1),
);

# ╔═╡ 9d2e9033-b197-40e4-bb35-d7a7b2f2de81
# Absolute performance and relative throughput over traffic,
# one plot row per y-quantity, colored by `color_fatness`.
overview_abs_rel =
	mapping(
		axis_traffic,
		Any[axis_perf_abs, axis_throughput_rel],
		row=dims(1),
	) *
	color_fatness;

# ╔═╡ 104c83cc-93d5-43e9-b1ce-5270b11c3f78
# Relative performance and relative throughput over traffic,
# one plot row per y-quantity, colored by `color_fatness`.
overview_rel_rel =
	mapping(
		axis_traffic,
		Any[axis_perf_rel, axis_throughput_rel],
		row=dims(1),
	) *
	color_fatness;

# ╔═╡ beba8f6d-ad04-404e-9b69-b670f9d119ec
marker_impl = let
	alignments = ("unaligned", "aligned")
	# Fixed order of the `impl_summary_full` values for the marker legend.
	order = vcat(
		"naive",
		["$acc acc." for acc in 1:8],
		"Eigen-alike",
		["1 vector-acc.\n(init-first $align)" for align in alignments],
		["$acc vector-acc.\n(init-first)" for acc in 1:4],
		["1 vector-acc.\n(init-zero $align)" for align in alignments],
		["$acc vector-acc.\n(init-zero)" for acc in 1:4],
		"MKL v202101",
		"Eigen v3.4.0",
	)
	mapping(marker = :impl_summary_full => sorter(order) => "Implementation")
end;

# ╔═╡ b2da69cd-38ed-4516-b7e4-17013462ace5
# Vertical lines at the `size` column of `cache`,
# labelled like the traffic axis ("Traffic").
plt_cache = data(cache) *
	mapping(:size => last(axis_traffic)) *
	visual(VLines);

# ╔═╡ 86afd175-eed8-497d-bcd9-76e1f955eed0
"""
    damn_draw(plt; title=nothing, kwargs...)

Draw `plt` on top of the cache-size markers `plt_cache`, using the
notebook-wide `axis` and `colorbar` settings, and return the result of `draw`.

# Keywords
- `title`: optional axis title; omitted if `nothing`.
- `kwargs...`: forwarded to `draw` (e.g. `figure=(resolution=(1200, 900),)`).
"""
function damn_draw(plt; title=nothing, kwargs...)
	# `axis` is merged last, so its entries would win over `title` on a clash.
	_axis = isnothing(title) ? axis : merge((; title), axis)
	draw(plt_cache + plt; axis=_axis, colorbar, kwargs...)
end

# ╔═╡ f5edc2f3-9bbd-4c52-99ca-fce788cbb2b7
# Throughput over traffic for `df_throughput`, colored by its description.
damn_draw(
	data(df_throughput) *
	mapping(axis_traffic, axis_throughput; color=:description => "Description")
)

# ╔═╡ ae4ab095-3261-4f02-b6e1-12dd2d6baeb2
let
	# `df_csr32_naive` normalized by `df_csr_mkl`, relative performance over traffic.
	naive_vs_mkl = normalize(df_csr32_naive, df_csr_mkl)
	layers = data(naive_vs_mkl) * map_relperf_vs_traffic * color_fatness
	damn_draw(layers)
end

# ╔═╡ 93c6ebca-ce89-40a6-8eb6-01c70c51c521
let
	plt = versusplot(df_csr32_acc_and_naive, :impl_summary)
	# Draw with a smaller marker size for this figure only. The result is stored
	# in the global `fig_csr_acc_pairs` so that another cell can export it.
	with_theme(markersize=3) do
		global fig_csr_acc_pairs =
			damn_draw(plt; figure=(resolution=(1200, 900),))
	end
end

# ╔═╡ 7bf21dce-dd37-4054-8876-0ddbd7260c79
let
	# Makie cannot save into an IOBuffer, hence the detour via a real file
	# in `tmp` whose bytes are then handed to the download button.
	pdf_name = "csr_acc_pairs.pdf"
	pdf_path = joinpath(tmp, pdf_name)
	save(pdf_path, fig_csr_acc_pairs)
	DownloadButton(read(pdf_path), pdf_name)
end

# ╔═╡ 0d5e2b30-58ca-4447-97e8-7b31bceeb0d6
# NOTE(review): the right-hand side of a `let` binding is evaluated before the
# new local `impls` exists, so `names(impls)` refers to an `impls` defined
# outside this cell — confirm that such a global exists.
let impls = select(df_csr32_xsimd_baseline, names(impls))
	# Distinct baseline implementations plus what `find_best_impl(…, 2)` returns
	# for the xsimd measurements.
	unique!(impls)
	append!(impls, find_best_impl(df_csr32_xsimd, 2))
	df = select_impl(df_csr32_xsimd_and_baseline, impls)
	plt = versusplot(df, :impl_summary)
	damn_draw(plt)
end

# ╔═╡ e9b820d3-9c1f-4ad7-8223-74541c8c7e6d
let
	# `df_csr32_xsimd_best` normalized by `df_csr32_naive`; markers tell the
	# implementations apart.
	xsimd_vs_naive = normalize(df_csr32_xsimd_best, df_csr32_naive)
	layers = data(xsimd_vs_naive) * map_relperf_vs_traffic * color_fatness * marker_impl
	damn_draw(layers)
end

# ╔═╡ f5279105-83b6-4a60-9582-84e7ab9d793d
# Absolute performance and throughput of `df_csr48_xsimd_best_omp`,
# colored by thread count, one marker per implementation.
damn_draw(data(df_csr48_xsimd_best_omp) * overview_abs_abs * color_threads * marker_impl)

# ╔═╡ 2ffaf292-b31f-460b-8d40-67b5a5162ad4
# Same overview for `df_csr48_omp_best`.
damn_draw(data(df_csr48_omp_best) * overview_abs_abs * color_threads * marker_impl)

# ╔═╡ 0cf3b21c-1900-4d57-aa18-c7023c55ca94
# Same overview for `df_dacsr_omp_best`.
damn_draw(data(df_dacsr_omp_best) * overview_abs_abs * color_threads * marker_impl)

# ╔═╡ c6c36217-2dd2-44c7-9676-236d9cf35981
"""
    draw_overview(df[, traffic_limits]; perf_labels)

Draw a two-panel overview of the measurements in `df` and return the `Figure`.

The upper panel shows `perf_FLOP/s` in GFLOP/s, the lower panel
`throughput_relative`, both over `traffic_B` with the notebook-wide `axis`
settings. The y-ranges are fixed to 0…4 and 0…1 plus a 5% margin. Black vertical
lines mark `cache.size`, and every region between two neighbouring lines gets a
text label at the top of each panel.

# Arguments
- `df`: table with the columns `traffic_B`, `perf_FLOP/s`,
  `throughput_relative`, `A_nnz/nrows` and `slide_desc`.
- `traffic_limits`: `(min, max)` traffic; the x-axis spans from half the
  minimum to 1.5 times the maximum. Defaults to the extrema of `df.traffic_B`.

# Keywords
- `perf_labels`: labels for the upper panel, one per region; defaults to the
  cache level names followed by `"Large"`, which is what the lower panel shows.
"""
function draw_overview(
	df,
	traffic_limits = extrema(df.traffic_B),
	;
	perf_labels = vcat(cache.level, "Large"),
)
	perf_top = 4.0 # upper end of the performance axis
	thrp_top = 1.0 # upper end of the relative-throughput axis
	traffic_lo, traffic_hi = traffic_limits

	fig = Figure(resolution=(1200, 600))
	ax_perf = Axis(fig[1,1]; axis...)
	ax_thrp = Axis(fig[2,1]; axis..., yticks=0:0.2:1.2)
	hidexdecorations!(ax_perf, minorgrid=false, grid=false)

	# Same padded x-range for both panels, fixed y-range with a 5% margin,
	# and vertical lines at the cache sizes
	for (ax, top) in ((ax_perf, perf_top), (ax_thrp, thrp_top))
		margin = 0.05 * top
		xlims!(ax, traffic_lo * 0.5, traffic_hi * 1.5)
		ylims!(ax, -margin, top + margin)
		vlines!(ax, cache.size, color=:black)
	end

	# Labels sit at the geometric mean of the two boundaries of each region
	edges = vcat(traffic_lo, cache.size, traffic_hi)
	label_x = sqrt.(edges[1:end-1] .* edges[2:end])
	nlabels = length(label_x)
	text!(ax_perf, label_x, fill(perf_top, nlabels);
		text=perf_labels,
		align=(:center, :top),
	)
	text!(ax_thrp, label_x, fill(thrp_top, nlabels);
		text=vcat(cache.level, "Large"),
		align=(:center, :top),
	)

	# Add measurements
	encoding = (
		color = "A_nnz/nrows" => log10,
		marker = :slide_desc => sorter([
			"naive",
			"multiple accumulators",
			"explicit vectorization",
			"naive,\n16-bit column indices",
			"multiple accumulators,\n16-bit column indices",
			"explicit vectorization,\n16-bit column indices",
		]) => "Implementation",
	)
	palettes = (marker = [:circle, :utriangle, :cross, :rect, :dtriangle, :xcross],)
	layer(y) = data(df) * mapping(:traffic_B, y; encoding...)
	perf = draw!(ax_perf, layer("perf_FLOP/s" => (p -> p/1000^3)); palettes)
	draw!(ax_thrp, layer(:throughput_relative); palettes)

	# Colorbar and legend are taken from the upper panel and span both rows
	colorbar!(fig[1:2,2], perf;
		label="Average nnz per row",
		tellheight=false,
		flipaxis=true,
		colorbar...)
	legend!(fig[1:2,3], perf; width=220)

	Label(fig[1, 0], "Performance [GFLOP/s]", rotation=pi/2, tellheight=false)
	Label(fig[2, 0], "Relative Throughput", rotation=pi/2, tellheight=false)

	return fig
end

# ╔═╡ 8c91ff03-b9bc-4bf2-9838-94c9bb90a4ea
# Overview of `df_dacsr_best`; the x-range follows the data's own traffic extrema.
draw_overview(df_dacsr_best)

# ╔═╡ 37214219-e656-4f18-87d5-2b85a9abf270
"""
    draw_overview_omp(df[, traffic_limits]; perf_labels, perf_max=nothing)

Draw a two-panel overview of the measurements in `df` and return the `Figure`.

The upper panel shows `perf_FLOP/s` in GFLOP/s, the lower panel
`throughput_GiB/s`, both over `traffic_B` with the notebook-wide `axis`
settings. Black vertical lines mark `cache.size`, and every region between two
neighbouring lines gets a text label near the top of each panel.

# Arguments
- `df`: table with the columns `traffic_B`, `perf_FLOP/s`, `throughput_GiB/s`,
  `A_nnz/nrows` and `slide_desc`.
- `traffic_limits`: `(min, max)` traffic; the x-axis spans from half the
  minimum to 1.5 times the maximum. Defaults to the extrema of `df.traffic_B`.

# Keywords
- `perf_labels`: labels for the upper panel, one per region; defaults to the
  cache level names followed by `"Large"`, which is what the lower panel shows.
- `perf_max`: if given, fixes the upper panel's y-range to 0…`perf_max` plus a
  5% margin and places the labels at `perf_max`. With `nothing`, the y-range is
  left to Makie and the labels sit at the largest measured performance.
"""
function draw_overview_omp(
	df,
	traffic_limits = extrema(df.traffic_B),
	;
	perf_labels = vcat(cache.level, "Large"),
	perf_max = nothing,
)
	fig = Figure(resolution=(1200, 600))
	ax1 = Axis(fig[1,1]; axis...)
	ax2 = Axis(fig[2,1]; axis...)
	hidexdecorations!(ax1, minorgrid=false, grid=false)

	# Fix axis limits
	xmin, xmax = traffic_limits
	xmin *= 0.5
	xmax *= 1.5
	xlims!(ax1, xmin, xmax)
	xlims!(ax2, xmin, xmax)
	# Height of the labels in the upper panel; replaced by `perf_max` if given
	ymax1 = maximum(df."perf_FLOP/s") / 1000^3
	if !isnothing(perf_max)
		ymax1 = perf_max
		Δ = 0.05 * perf_max
		ylims!(ax1, -Δ, perf_max + Δ)
	end

	# Mark cache sizes
	ymax2 = maximum(df."throughput_GiB/s")
	vlines!(ax1, cache.size, color=:black)
	vlines!(ax2, cache.size, color=:black)
	min_traffic, max_traffic = traffic_limits
	sizes = vcat(
		min_traffic,
		cache.size,
		max_traffic
	)
	nlabels = length(sizes) - 1
	# Labels sit at the geometric mean of the two boundaries of each region
	label_pos = [sqrt(sizes[i] * sizes[i+1]) for i in 1:nlabels]
	label_txt = vcat(cache.level, "Large")
	text!(ax1, label_pos, fill(ymax1, nlabels),
		text=perf_labels,
		align=(:center, :top),
	)
	text!(ax2, label_pos, fill(ymax2, nlabels),
		text=label_txt,
		align=(:center, :top),
	)

	# Add measurements
	kwargs = (
		color = "A_nnz/nrows" => log10,
		marker = :slide_desc => sorter([
			"naive",
			"multiple accumulators",
			"explicit vectorization",
			"naive,\n16-bit column indices",
			"multiple accumulators,\n16-bit column indices",
			"explicit vectorization,\n16-bit column indices",
		]) => "Implementation",
	)
	palettes = (
		marker = [:circle, :utriangle, :cross, :rect, :dtriangle, :xcross],
	)
	perf = draw!(ax1, data(df) * mapping(
		:traffic_B,
		"perf_FLOP/s" => p -> p/1000^3,
		; kwargs...
	); palettes)
	draw!(ax2, data(df) * mapping(
		:traffic_B,
		"throughput_GiB/s",
		; kwargs...
	); palettes)
	# Colorbar and legend are taken from the upper panel and span both rows
	colorbar!(fig[1:2,2], perf,
		label="Average nnz per row",
		tellheight=false,
		flipaxis=true,
		; colorbar...)
	legend!(fig[1:2,3], perf,
		width=220,
	)

	Label(fig[1, 0], "Performance [GFLOP/s]", rotation=pi/2, tellheight=false)
	Label(fig[2, 0], "Throughput [GiB/s]", rotation=pi/2, tellheight=false)

	fig
end

# ╔═╡ b3b326bd-36d9-412f-ad66-fc5dcda2fb56
"""
    draw_comparison(df_candidate, df_baseline[, traffic_limits]; kwargs...)

Plot performance and throughput of `df_candidate` relative to `df_baseline`
over the SpMV traffic and return the `Figure`.

Both tables are matched on `A_name`. The upper panel shows the ratio of
`perf_FLOP/s` (candidate over baseline), the lower panel the ratio of
`throughput_GiB/s`. The mean ratio per `cache_level` is printed as a signed
percentage at the top of each panel, and the data ranges of both ratios are
logged via `@info`.

# Arguments
- `df_candidate`: measurements to be rated; needs the columns `A_name`,
  `traffic_B`, `traffic_h`, `cache_level`, `A_nnz/nrows`, `slide_desc`,
  `perf_FLOP/s` and `throughput_GiB/s`.
- `df_baseline`: reference measurements with the columns `A_name`,
  `perf_FLOP/s` and `throughput_GiB/s`; must cover every matrix of
  `df_candidate`.
- `traffic_limits`: `(min, max)` traffic; the x-axis spans from half the
  minimum to 1.5 times the maximum. Defaults to the extrema of
  `df_candidate.traffic_B`.

# Keywords
- `perf_limits`, `throughput_limits`: `(lo, hi)` y-range of the respective
  panel, widened by a 5% margin; rows outside the widened range are reported
  via `@warn`. With `nothing`, the y-range is left to Makie.
- `perf_axis`, `throughput_axis`: extra `Axis` attributes for the respective
  panel, applied after the notebook-wide `axis` settings.
"""
function draw_comparison(
	df_candidate,
	df_baseline,
	# Must refer to an argument: the local `df` below does not exist yet when
	# this default is evaluated.
	traffic_limits = extrema(df_candidate.traffic_B),
	;
	perf_limits = nothing,
	throughput_limits = nothing,
	perf_axis::NamedTuple = (yticks=0.0:0.2:2.0,),
	throughput_axis::NamedTuple = (yticks=0.0:0.2:2.0,),
)
	# Normalize performance and throughput
	@assert levels(df_baseline.A_name) ⊇ levels(df_candidate.A_name)
	cols = ["A_name"]
	df = leftjoin(
		df_candidate,
		df_baseline[!, vcat(cols, "perf_FLOP/s", "throughput_GiB/s")],
		on=cols,
		renamecols=""=>"_baseline",
	)
	df.perf_vs_baseline = df."perf_FLOP/s" ./ df."perf_FLOP/s_baseline"
	df.throughput_vs_baseline = df."throughput_GiB/s" ./ df."throughput_GiB/s_baseline"
	select!(df, Not(["perf_FLOP/s_baseline", "throughput_GiB/s_baseline"]))

	# Create plot
	fig = Figure(resolution=(1200, 600))
	ax1 = Axis(fig[1,1]; axis..., perf_axis...)
	ax2 = Axis(fig[2,1]; axis..., throughput_axis...)
	hidexdecorations!(ax1, minorgrid=false, grid=false)

	# Fix axis limits
	@info("Data limits",
		extrema(df.perf_vs_baseline),
		extrema(df.throughput_vs_baseline),
	)
	xmin, xmax = traffic_limits
	xmin *= 0.5
	xmax *= 1.5
	xlims!(ax1, xmin, xmax)
	xlims!(ax2, xmin, xmax)
	# Heights of the annotations; replaced by the upper limits if given
	ymin1, ymax1 = extrema(df.perf_vs_baseline)
	ymin2, ymax2 = extrema(df.throughput_vs_baseline)
	# Rows whose ratio in `col` satisfies `fn`, reduced to a few columns
	# for the cropping warnings
	subset_cropped(col, fn) = subset(
		df[!, [:A_name, :traffic_h, :perf_vs_baseline, :throughput_vs_baseline]],
		col => ByRow(fn),
	)
	if !isnothing(perf_limits)
		ymin, ymax = perf_limits
		Δ = 0.05 * (ymax - ymin)
		ymin -= Δ
		ymax += Δ
		ylims!(ax1, ymin, ymax)
		cropped = subset_cropped(:perf_vs_baseline, <(ymin))
		isempty(cropped) || @warn "Lower perf limit causes cropping" cropped
		cropped = subset_cropped(:perf_vs_baseline, >(ymax))
		isempty(cropped) || @warn "Upper perf limit causes cropping" cropped
		ymax1 = perf_limits[2]
	end
	if !isnothing(throughput_limits)
		ymin, ymax = throughput_limits
		Δ = 0.05 * (ymax - ymin)
		ymin -= Δ
		ymax += Δ
		ylims!(ax2, ymin, ymax)
		cropped = subset_cropped(:throughput_vs_baseline, <(ymin))
		isempty(cropped) || @warn "Lower throughput limit causes cropping" cropped
		cropped = subset_cropped(:throughput_vs_baseline, >(ymax))
		isempty(cropped) || @warn "Upper throughput limit causes cropping" cropped
		ymax2 = throughput_limits[2]
	end

	# Mark cache sizes
	vlines!(ax1, cache.size, color=:black)
	vlines!(ax2, cache.size, color=:black)

	# Add annotations on average change vs baseline
	df_annotations = combine(
		groupby(df, :cache_level),
		:perf_vs_baseline => mean => :perf_ratio,
		:throughput_vs_baseline => mean => :throughput_ratio,
	)
	df_annotations.perf_label =
		["$(@sprintf "%+2.1f" 100(p - 1))%" for p in df_annotations.perf_ratio]
	df_annotations.throughput_label =
		["$(@sprintf "%+2.1f" 100(p - 1))%" for p in df_annotations.throughput_ratio]
	sort!(df_annotations, :cache_level)
	min_traffic, max_traffic = traffic_limits
	sizes = vcat(
		min_traffic,
		cache.size,
		max_traffic
	)
	nlabels = length(sizes) - 1
	# Labels sit at the geometric mean of the two boundaries of each region.
	# NOTE(review): assumes `df_annotations` has exactly `nlabels` rows, i.e.
	# every region occurs in `df.cache_level` and sorts in region order — confirm.
	label_pos = [sqrt(sizes[i] * sizes[i+1]) for i in 1:nlabels]
	text!(ax1, label_pos, fill(ymax1, nlabels),
		text=df_annotations.perf_label,
		align=(:center, :top),
	)
	text!(ax2, label_pos, fill(ymax2, nlabels),
		text=df_annotations.throughput_label,
		align=(:center, :top),
	)

	# Add measurements
	kwargs = (
		color = "A_nnz/nrows" => log10,
		marker = :slide_desc => sorter([
			"naive",
			"multiple accumulators",
			"explicit vectorization",
			"naive,\n16-bit column indices",
			"multiple accumulators,\n16-bit column indices",
			"explicit vectorization,\n16-bit column indices",
			"float32",
			"float64",
		]) => "Implementation",
	)
	palettes = (
		marker = [:circle, :utriangle, :cross, :rect, :dtriangle, :xcross],
	)
	perf = draw!(ax1, data(df) * mapping(
		:traffic_B,
		:perf_vs_baseline,
		; kwargs...
	); palettes)
	draw!(ax2, data(df) * mapping(
		:traffic_B,
		:throughput_vs_baseline,
		; kwargs...
	); palettes)
	# Colorbar and legend are taken from the upper panel and span both rows
	colorbar!(fig[1:2,2], perf,
		label="Average nnz per row",
		tellheight=false,
		flipaxis=true,
		; colorbar...)
	legend!(fig[1:2,3], perf,
		width=220,
	)

	Label(fig[1, 0], "Relative Performance", rotation=pi/2, tellheight=false)
	Label(fig[2, 0], "Relative Throughput", rotation=pi/2, tellheight=false)

	fig
end

# ╔═╡ b97a6697-d678-4cf5-9ded-f6f2752c3815
# `df_csr16_xsimd_best` relative to `df_csr32_xsimd_best`, default limits.
draw_comparison(df_csr16_xsimd_best, df_csr32_xsimd_best)

# ╔═╡ 3384cf73-67ad-4554-b8b3-dd9aee7b89a1
# `df_csr16_xsimd_best` relative to `df_csr_mkl`, default limits.
draw_comparison(df_csr16_xsimd_best, df_csr_mkl)

# ╔═╡ a97f20c7-7587-4854-8249-638c159d8f04
# `df_csr16_omp_best` relative to `df_csr32_omp_best`, default limits.
draw_comparison(df_csr16_omp_best, df_csr32_omp_best)

# ╔═╡ 33f6450a-7f65-40e6-a69f-346ec37c75f6
# Common traffic range handed to the exported figures below,
# taken from `df_csr48_best`.
traffic_limits = extrema(df_csr48_best.traffic_B)

# ╔═╡ 1634495e-fbcd-4396-9a4a-2bab0067c806
let
	# Overview of `df_csr32_naive`, exported as PDF below `fig_path`.
	pdf = abspath(fig_path, "$(variant)_csr32_naive.pdf")
	overview = draw_overview(df_csr32_naive, traffic_limits)
	@info pdf
	save(pdf, overview)
	overview
end

# ╔═╡ ed778294-0fde-454c-80bf-499e860144f5
let
	# Overview of `df_csr32_acc_best`, annotated with the labels from
	# `df_slides_acc_vs_naive`.
	pdf = abspath(fig_path, "$(variant)_csr32_acc_annotated.pdf")
	overview = draw_overview(
		df_csr32_acc_best, traffic_limits;
		perf_labels = df_slides_acc_vs_naive.label,
	)
	@info pdf
	save(pdf, overview)
	overview
end

# ╔═╡ 8f54ba7d-21a1-48f6-b3ac-a038c0a9713f
let
	# Overview of `df_csr32_xsimd_best`, annotated with the labels from
	# `df_slides_xsimd_vs_acc`.
	pdf = abspath(fig_path, "$(variant)_csr32_xsimd_annotated.pdf")
	overview = draw_overview(
		df_csr32_xsimd_best, traffic_limits;
		perf_labels = df_slides_xsimd_vs_acc.label,
	)
	@info pdf
	save(pdf, overview)
	overview
end

# ╔═╡ 3b734bcc-4202-4845-b824-5f77e86fa969
let
	# `df_csr32_xsimd_best` relative to `df_csr_mkl`.
	pdf = abspath(fig_path, "$(variant)_csr32-vs-mkl.pdf")
	comparison = draw_comparison(
		df_csr32_xsimd_best, df_csr_mkl, traffic_limits;
		perf_limits = csr_vs_mkl_perf_limits,
		throughput_limits = csr_vs_mkl_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ e923e717-2273-4c26-aed5-8fecc994d037
let
	# Overview of `df_csr48_best`, annotated with the labels from
	# `df_slides_csr16_vs_csr32`.
	pdf = abspath(fig_path, "$(variant)_csr48_xsimd_annotated.pdf")
	annotated = draw_overview(
		df_csr48_best, traffic_limits;
		perf_labels = df_slides_csr16_vs_csr32.label,
	)
	@info pdf
	save(pdf, annotated)
	annotated
end

# ╔═╡ c1747831-b53a-4a89-a200-7bfa886323e1
let
	# Absolute overview of `df_csr48_best` with the performance axis capped at 4.
	pdf = abspath(fig_path, "$(variant)_csr48.pdf")
	overview = draw_overview_omp(df_csr48_best, traffic_limits; perf_max = 4)
	@info pdf
	save(pdf, overview)
	overview
end

# ╔═╡ aeaac19a-bf53-464a-b632-cb110ef487be
let
	# `df_csr48_best` relative to `df_csr_mkl`.
	pdf = abspath(fig_path, "$(variant)_csr48-vs-mkl.pdf")
	comparison = draw_comparison(
		df_csr48_best, df_csr_mkl, traffic_limits;
		perf_limits = csr_vs_mkl_perf_limits,
		throughput_limits = csr_vs_mkl_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 5fd9386c-0053-4a1f-8b38-da4735dfe601
let
	# Absolute overview of `df_csr48_omp_best`.
	pdf = abspath(fig_path, "$(variant)_csr48_omp.pdf")
	overview = draw_overview_omp(df_csr48_omp_best, traffic_limits)
	@info pdf
	save(pdf, overview)
	overview
end

# ╔═╡ cabf7736-0228-43ad-987d-d422cf929c16
let
	# Absolute overview of `df_csr32_omp_best`.
	pdf = abspath(fig_path, "$(variant)_csr32_omp.pdf")
	overview = draw_overview_omp(df_csr32_omp_best, traffic_limits)
	@info pdf
	save(pdf, overview)
	overview
end

# ╔═╡ 80561378-e841-45cb-b48a-0ac2731f1aeb
let
	# `df_csr32_omp_best` relative to `df_omp_csr_mkl_best`.
	pdf = abspath(fig_path, "$(variant)_csr32-vs-mkl_omp.pdf")
	comparison = draw_comparison(
		df_csr32_omp_best, df_omp_csr_mkl_best, traffic_limits;
		perf_limits = csr_vs_mkl_perf_limits,
		throughput_limits = csr_vs_mkl_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 2051f633-9e71-4435-938c-0d55f422c305
let
	# `df_csr48_omp_best` relative to `df_omp_csr_mkl_best`.
	pdf = abspath(fig_path, "$(variant)_csr48-vs-mkl_omp.pdf")
	comparison = draw_comparison(
		df_csr48_omp_best, df_omp_csr_mkl_best, traffic_limits;
		perf_limits = csr_vs_mkl_perf_limits,
		throughput_limits = csr_vs_mkl_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 2e87451a-8271-472a-930f-8a1012843889
let
	# `df_dacsr_best`, restricted to the matrices in `small_matrices`,
	# relative to `df_csr16_xsimd_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-csr16.pdf")
	dacsr_small = innerjoin(df_dacsr_best, small_matrices; on=:A_name)
	comparison = draw_comparison(
		dacsr_small, df_csr16_xsimd_best, traffic_limits;
		perf_limits = dacsr_vs_csr_perf_limits,
		throughput_limits = dacsr_vs_csr_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 68fcf117-9465-4ac2-91b0-591712180e0d
let
	# `df_dacsr_best` relative to `df_csr32_xsimd_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-csr32.pdf")
	comparison = draw_comparison(
		df_dacsr_best, df_csr32_xsimd_best, traffic_limits;
		perf_limits = dacsr_vs_csr_perf_limits,
		throughput_limits = dacsr_vs_csr_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 2ca6e915-8db1-4187-8b33-ba73654495b8
let
	# `df_dacsr_best` relative to `df_csr48_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-csr48.pdf")
	comparison = draw_comparison(
		df_dacsr_best, df_csr48_best, traffic_limits;
		perf_limits = dacsr_vs_csr_perf_limits,
		throughput_limits = dacsr_vs_csr_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ ee30a184-7fe3-4ea9-af0e-06c3678ae832
let
	# `df_dacsr_best` relative to `df_csr_mkl`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-mkl.pdf")
	comparison = draw_comparison(
		df_dacsr_best, df_csr_mkl, traffic_limits;
		perf_limits = dacsr_vs_mkl_perf_limits,
		throughput_limits = dacsr_vs_mkl_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ b4ff2923-3dd9-46dc-bcd8-12462a16e5c6
let
	# `df_dacsr_omp_best`, restricted to the distinct matrix names of `df_csr16`,
	# relative to `df_csr16_omp_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-csr16_omp.pdf")
	csr16_names = unique!(select(df_csr16, :A_name))
	dacsr_small = innerjoin(df_dacsr_omp_best, csr16_names; on=:A_name)
	comparison = draw_comparison(
		dacsr_small, df_csr16_omp_best, traffic_limits;
		perf_limits = dacsr_vs_csr_perf_limits,
		throughput_limits = dacsr_vs_csr_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 0a64fca3-0351-4f78-ac74-4af87941a82d
let
	# `df_dacsr_omp_best` relative to `df_csr32_omp_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-csr32_omp.pdf")
	comparison = draw_comparison(
		df_dacsr_omp_best, df_csr32_omp_best, traffic_limits;
		perf_limits = dacsr_vs_csr_perf_limits,
		throughput_limits = dacsr_vs_csr_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ c890a64f-1011-4124-8f57-028339ccf4c0
let
	# `df_dacsr_omp_best` relative to `df_csr48_omp_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-csr48_omp.pdf")
	comparison = draw_comparison(
		df_dacsr_omp_best, df_csr48_omp_best, traffic_limits;
		perf_limits = dacsr_vs_csr_perf_limits,
		throughput_limits = dacsr_vs_csr_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 28e3172a-6356-4e92-998c-d1add7b18fb1
let
	# `df_dacsr_omp_best` relative to `df_omp_csr_mkl_best`.
	pdf = abspath(fig_path, "$(variant)_dacsr-vs-mkl_omp.pdf")
	comparison = draw_comparison(
		df_dacsr_omp_best, df_omp_csr_mkl_best, traffic_limits;
		perf_limits = dacsr_vs_mkl_perf_limits,
		throughput_limits = dacsr_vs_mkl_thrp_limits,
	)
	save(pdf, comparison)
	@info pdf
	comparison
end

# ╔═╡ 1865065c-144a-4c2d-a548-3c0c38c6fa20
md"""
## Expected Performance

Assume the performance would scale perfectly,
i.e. the throughputs for DA-CSR(int16) and CSR(int32) were *identical*.
Would relative performance tend towards the limit of $(@sprintf "%2.0f" 100ideal_perf_ratio)% as anticipated?
Scaling the single-threaded measurements accordingly,
the DA-CSR vs CSR comparison looks as follows.
"""

# ╔═╡ 4f04194d-d766-45c1-91bf-6b7bbd47e5f7
df_slides_dacsr_expected = let
	join_cols = ["A_name", "variant"]
	# Attach the throughput of `df_csr32_xsimd_best` as "throughput_GiB/s_baseline"
	scaled = leftjoin(
		copy(df_dacsr_best),
		df_csr32_xsimd_best[!, vcat(join_cols, "throughput_GiB/s")],
		on = join_cols,
		renamecols = "" => "_baseline",
	)
	# Scale the performance by baseline throughput over measured throughput,
	# then pretend the baseline throughput had been measured
	scaled."perf_FLOP/s" .*=
		scaled."throughput_GiB/s_baseline" ./
		scaled."throughput_GiB/s"
	scaled."throughput_GiB/s" = scaled."throughput_GiB/s_baseline"
	select!(scaled, Not("throughput_GiB/s_baseline"))
end

# ╔═╡ 1a4051a5-64ac-4f3f-9fce-749aa988a855
# The rescaled DA-CSR data relative to `df_csr32_xsimd_best`, default limits.
draw_comparison(df_slides_dacsr_expected, df_csr32_xsimd_best)

# ╔═╡ ccc215b0-afde-4dc1-907a-853bdb98fe2a
md"""
## Multi-Precision

When exchanging 64-bit scalars for 32-bit ones,
one would expect a 50% uplift in performance when using CSR(int32).
Unfortunately, not even the MKL is able to do so.
"""

# ╔═╡ 4b5f30f0-7afc-4645-a08a-049f8ea3f3c1
let
	# MKL rows of `df_omp` for CSR with int32 outer indices, all variants
	mkl = subset(
		df_omp,
		:A_format => ByRow(==("CSR")),
		:A_oindex => ByRow(==("int32")),
		:impl_vendor => ByRow(==("MKL")),
	)
	# The scalar type serves as the marker category in the figure
	mkl.slide_desc = mkl.A_scalar
	best_of(name) = find_best(subset(mkl, :variant => ByRow(==(name))))
	draw_comparison(best_of("A32_x32_y32"), best_of("A64_x64_y64"))
end

# ╔═╡ 1fe4233f-b613-4618-8604-60422f9daab4
md"""
To summarize, when comparing

* CSR(int32, *int16*, double) to CSR(int32, *int32*, double), or
* DA-CSR(int32, *int16*, double) to CSR(int32, *int32*, double), or
* MKL CSR(int32, int32, *float*) to CSR(int32, int32, *double*)

the relative throughput is less than 1 for larger off-chip traffic.
That means in all these cases the throughput drops slightly despite reducing the amount of traffic.

!!! info "Conjecture"
    The SpMV is not heavily limited by the memory bandwidth.
    Instead, the performance is close to another limiting factor.

Finding and lifting this other cause remains future research.
"""

# ╔═╡ df82699b-3aa3-4cab-8aa0-f2409a4dccf6
md"""
## Eigen vs MKL

How much worse is Eigen?
"""

# ╔═╡ b2ee7359-8d59-40a1-9c7f-8c5c4553f7f2
let
	# Eigen rows of `df_omp` for CSR with int32 outer indices and the current
	# `variant`, reduced by `find_best`
	df_eigen_omp = find_best(subset(
		df_omp,
		:A_format => ByRow(==("CSR")),
		:A_oindex => ByRow(==("int32")),
		:variant => ByRow(==(variant)),
		:impl_vendor => ByRow(==("Eigen")),
	))
	# The scalar type serves as the marker category in the figure
	df_eigen_omp.slide_desc = df_eigen_omp.A_scalar
	# Shallow copy, so that adding `slide_desc` leaves `df_omp_csr_mkl_best` untouched
	df_mkl_omp = copy(df_omp_csr_mkl_best, copycols=false)
	df_mkl_omp.slide_desc = df_mkl_omp.A_scalar
	draw_comparison(df_eigen_omp, df_mkl_omp)
end

# ╔═╡ a393c840-8ed5-46cd-a7f2-e4a5e4b416a0
md"""
## Naive vs Simple DA Implementation

Is it really important to resolve the index offsets before the loop,
i.e. with O(nrows) instead of O(nnz) integer operations overhead?
Or is the compiler able to do this?
"""

# ╔═╡ ddb5bcc8-0743-4af3-931f-8d59621fcbbe
let
	# Filters shared by both selections: DA-CSR with int32 outer and int16 inner
	# indices for the current `variant`
	dacsr = (
		:A_format => ByRow(==("DA-CSR")),
		:A_oindex => ByRow(==("int32")),
		:A_iindex => ByRow(==("int16")),
		:variant => ByRow(==(variant)),
	)
	naive = subset(df, dacsr..., :impl_desc => ByRow(==("naive")))
	simple = subset(
		df,
		dacsr...,
		:impl_desc => ByRow(==("simple")),
		:impl_nacc => ByRow(v -> !ismissing(v) && v == 1),
	)
	# The scalar type serves as the marker category in the figure
	naive.slide_desc = naive.A_scalar
	simple.slide_desc = simple.A_scalar
	draw_comparison(naive, simple)
end

# ╔═╡ e1ce83df-edf2-4604-a7b5-e4c753ced87d
md"""
The above results are contrary to what I would expect,
so I have no reasonable explanation.
I would have expected the naive implementation to perform better for the diagonal matrices,
given that it performs one 32-bit integer addition instead of one 64-bit pointer addition to resolve the diagonally-addressed column index.
For the other matrices, I would have expected the naive implementation to perform worse,
given that it performs O(nnz) instead of O(nrows) additions to resolve the DA-indices.
However, the opposite is the case.
"""

# ╔═╡ Cell order:
# ╟─1dfc1459-3f2d-4e98-9fc9-ea60e6ae6118
# ╠═6b9e49ac-f2f7-11ed-3e20-614d6f3d4ca3
# ╠═562b755a-8ba6-4baf-bb8c-20d626556459
# ╠═3f530c87-08e8-466b-b07b-3ea4e9ce8cea
# ╟─ec06f9d6-53b3-455b-b46d-ed03dc29786f
# ╠═085557db-7b8e-4210-85c2-135d71eb7fbc
# ╟─174e38d2-519f-4e8c-8465-03af554ba397
# ╠═53acb72c-876e-4a2c-ad8d-c8d88fbfaf0b
# ╠═928d7a42-3e91-4c69-b1ef-6877ade1e95f
# ╠═23614ac3-8122-4817-9fa2-44236206bd3c
# ╟─c3fb960c-5f3e-4d41-b981-df3eb2951883
# ╠═8e09926c-73d7-42c8-ad7e-c7310ed2133a
# ╟─27f0deec-fc59-43e1-8576-9ae9901a6db7
# ╟─40b76c0c-eb34-4452-a4ed-5d7b4e2882de
# ╟─6cf42151-ad9b-4de8-976c-ac25659e595e
# ╟─ca2bf257-8ddd-4973-9161-291d0b7f7ad0
# ╟─b311ecac-e6cb-4d91-8900-22a3833f44f5
# ╠═425a6927-a0b8-4d64-bdc3-874ccac9654e
# ╟─3a9bd51d-05f9-41a7-967c-e6b2f12747af
# ╟─e7d34afe-1a45-4a9a-b500-fd8715eec20d
# ╟─039b68da-41e7-4ba5-bad0-00a6213a801b
# ╟─f70ea745-b234-45a1-91ca-cfff6c9f4b98
# ╟─a19bfbbc-24f9-4b2d-a238-5fcf0332822d
# ╟─e3bf2756-f8c3-4030-96c4-a918f65ffdf0
# ╟─f5edc2f3-9bbd-4c52-99ca-fce788cbb2b7
# ╠═0d2eecd5-c360-4f2d-b086-d93069a02b86
# ╠═fbfa40d4-0301-4016-8247-e90b7c6029db
# ╟─b2fd274f-ae8f-4a1e-a42c-2c9c583ceb41
# ╠═8dc6fdb4-e69a-4b56-8f97-1abe9c39e24a
# ╟─9ed53f3e-1c77-4698-abf4-31e64f8657a7
# ╟─f755bf50-420c-4530-8ed2-8cdaefd9a396
# ╟─ec3f483e-3a52-4408-9202-b33f8b5fa116
# ╟─b6fd7219-b58d-4040-8d26-592e8577b8ef
# ╟─c03d86d1-8cb3-42dc-b0bf-5f74f43c34ab
# ╟─0301ffdd-acda-42eb-acbe-1e6c6a2912e1
# ╟─d3c8100f-d709-4303-873f-15afd5171b25
# ╟─9445b409-3575-4c61-bdd8-871a4c796bdd
# ╟─3f91f320-97df-4af9-a5ba-9a4ef9c0b5d3
# ╠═504da979-4eb3-4440-a10e-b1dcaf8f0297
# ╠═a06b7b6e-1a47-491e-b50c-72cf2545a2c6
# ╠═c79796ed-bb85-44c1-a28e-e0b6b9a51d18
# ╠═b504291f-f784-415a-bbcc-55687817be11
# ╠═cae1ff23-5c6d-4eb4-a222-86ae32d629e2
# ╠═ea2de365-df5e-41fa-800f-3e54e762780a
# ╟─869f9809-176d-497d-b2a9-aa60b6a099d4
# ╟─715f116b-3e3d-49f6-bac5-4fc0de19976c
# ╟─4126074c-2627-46b7-b69c-4153341f571d
# ╟─1a137de6-c1b5-4a9e-bfa3-4fde39520198
# ╟─f95c5d6e-ef6a-4437-8d88-01986eabc8fd
# ╠═183d6803-1f79-4206-ba88-afc314b8eb42
# ╟─1634495e-fbcd-4396-9a4a-2bab0067c806
# ╟─39d8f5b5-84e8-410f-923f-44d72318da89
# ╟─ae4ab095-3261-4f02-b6e1-12dd2d6baeb2
# ╟─9cad5ef7-e0e9-4a6f-8bd0-1b6717ce52cb
# ╟─facbb67c-de04-4cc8-9d4f-fedf3d4ba079
# ╟─7efd75e6-2c6a-4a6a-bdfb-7f63fcf9c729
# ╟─348689a4-bfc8-46d0-ae83-7e06b13d8902
# ╟─dd21e13d-3440-43ca-8e3e-cd015178d3fb
# ╠═293885af-8657-4090-b087-c1b98dc6fba4
# ╟─3e50bb1a-83f1-47db-bf89-dd2c0aadba48
# ╟─93c6ebca-ce89-40a6-8eb6-01c70c51c521
# ╟─7bf21dce-dd37-4054-8876-0ddbd7260c79
# ╟─2637e32a-0c6e-4a92-aef9-297e41016692
# ╠═f653cb80-980a-4159-b84f-f59e6698b765
# ╠═51189ea0-e454-4709-aebb-5203c22d98b3
# ╟─669b9ce3-ab39-4999-b976-76e3f6a729af
# ╟─b111fb26-adb7-4b60-9f36-06771b587f5e
# ╟─35b2ffa3-76f6-405d-b9d9-9e2ff2b16453
# ╠═2a405e0c-dd41-4acc-a7e6-6d293ce82070
# ╠═efcf31b4-08c2-4bb8-8b09-ed9606135061
# ╟─cc538aeb-448d-43c7-93a2-724876695adf
# ╟─e3d7093f-51eb-45cb-a75f-8fafb63823e8
# ╟─ed778294-0fde-454c-80bf-499e860144f5
# ╠═f91e7737-b802-423f-8a17-080f72d847bb
# ╟─b37bf6e3-d412-43fc-9058-99710ff4d014
# ╟─080c4c28-3b62-492d-a2b5-461ec3866515
# ╟─eaa4e5fb-f7a3-4965-aa55-4f7d63456529
# ╟─a3f9b853-4ce7-42dc-8090-5f5958124942
# ╟─fd0bb4e4-2eec-48c1-b4a4-7be4aa8e7eeb
# ╠═7da59d41-918c-4875-a1a7-43c5f07e2d0a
# ╠═5d49aee6-7ce3-465a-baa2-18e07585cd5c
# ╟─7bd36293-02e0-4b79-8813-5552abfd8f7e
# ╠═2cfb4fde-2e53-4d08-b6e3-a6b82cab5209
# ╠═42236bfb-9dd7-4bb2-b0cc-018a662f3909
# ╠═5057c526-f0cc-496b-bf9d-fa446bba92e2
# ╟─0d5e2b30-58ca-4447-97e8-7b31bceeb0d6
# ╠═a57ead15-3c91-4caa-8005-cfd0f42dc154
# ╠═c884f3d8-29ff-47fd-81f1-8995f95cc25a
# ╠═20db2172-169e-42bc-8e03-649a19272e2a
# ╟─7a451dfb-7c2c-403e-a48b-c99776cddd45
# ╟─e924bcfd-4c51-42d3-a9ab-c9c2100ec63d
# ╟─8f54ba7d-21a1-48f6-b3ac-a038c0a9713f
# ╟─fae9bdfb-a571-449c-aef1-0a3267e24c89
# ╟─e9b820d3-9c1f-4ad7-8223-74541c8c7e6d
# ╟─645f4b95-e354-489b-a3dc-b56abd17652e
# ╟─3b734bcc-4202-4845-b824-5f77e86fa969
# ╟─78975aec-99d1-4bde-9b09-20b660e9d9ca
# ╟─45f79dd6-c337-45f1-b7ef-dd0016ca5620
# ╟─476b1301-ec73-4c24-80a8-78b4295121a3
# ╟─4c7d151f-9d79-454c-934e-a6298c65b788
# ╟─3e723e2c-0005-4a12-b1e4-b5e84aef9549
# ╟─fb4e29fb-c4ff-4155-bfbc-6d4b5f693892
# ╟─c741e5b1-7687-4ece-9b33-b63a9afb32e9
# ╟─3eac77a5-7a56-4907-a3e6-ba8278530745
# ╠═1bc8b9e8-fef4-4234-bbae-c9058ae23c5b
# ╠═e52cea5d-5a25-4a77-835e-9709e9ac18d5
# ╠═b97a6697-d678-4cf5-9ded-f6f2752c3815
# ╟─ed92e232-8604-486d-bea9-0cac08b11605
# ╠═d95249c8-39ab-4240-a3ee-503832ef7597
# ╠═8af55781-b53d-41fa-8a0d-ed14f5454729
# ╟─e037f27e-70c0-473c-9339-86c00ea5e6b6
# ╠═0ed9c245-0583-4e89-b9b1-8f90ba7d170f
# ╟─6e113127-a2fa-4a63-b42b-c77887af3202
# ╟─e923e717-2273-4c26-aed5-8fecc994d037
# ╟─9e388fa0-ac35-4914-9838-77faa366f222
# ╟─c1747831-b53a-4a89-a200-7bfa886323e1
# ╟─ed4b6a1e-99cf-4544-aa3c-57d8afa6710b
# ╟─aeaac19a-bf53-464a-b632-cb110ef487be
# ╟─797250b7-f192-4dca-9e8a-288828ce259f
# ╟─3384cf73-67ad-4554-b8b3-dd9aee7b89a1
# ╟─5555bff6-bf59-4bc0-b02b-48b41e66c212
# ╠═c4897041-d92e-4f4c-8542-6a92b0781dc9
# ╠═f7645262-e9dc-44d5-b297-a308e57f04a8
# ╟─832379fd-a286-48d7-a220-c8fd7f4a9c21
# ╠═9e38efae-6db8-4afc-af1d-be33c9de8096
# ╠═2c971eec-30c3-4f06-b22b-58961a792f61
# ╠═6da0a70d-6415-429f-8798-e52f184eba5d
# ╠═6aab4d99-140a-402c-8350-8e7a94993315
# ╠═b5584593-7277-4eca-9205-5e0a16b257e0
# ╠═666c70c1-4aa4-4288-ab9b-014efda8c501
# ╠═20d567a8-43d8-4916-bd1d-db1bc849a706
# ╠═de5d79c5-98c0-4ffd-a331-aa1a82b2d3b0
# ╟─4963b6f9-9ad4-42a7-8d72-e8bedbde68eb
# ╟─f5279105-83b6-4a60-9582-84e7ab9d793d
# ╟─f54e1e06-f62c-4787-8783-7a22624f1c04
# ╟─2ffaf292-b31f-460b-8d40-67b5a5162ad4
# ╠═52cad7b5-866e-4098-81c2-d6ea22b308e1
# ╠═80ba86a3-dddd-4fbb-bad8-046fdafae7f4
# ╠═706e076b-fb66-4cd3-972d-b6a6f907528e
# ╠═56cebf10-cd38-4db2-940c-ee50dd18d84c
# ╟─5fd9386c-0053-4a1f-8b38-da4735dfe601
# ╟─cabf7736-0228-43ad-987d-d422cf929c16
# ╟─2c9b2424-2d31-46f7-80dc-e47b1d758501
# ╟─2a700192-842b-4d11-80d6-0616bd756964
# ╠═a97f20c7-7587-4854-8249-638c159d8f04
# ╟─b09bb2c2-5a27-4706-a789-036bfb62fbc5
# ╠═60a52995-fd4f-4991-b5d0-fb9b13b58204
# ╟─80561378-e841-45cb-b48a-0ac2731f1aeb
# ╟─a32c52e4-fe54-4cdc-a8fb-8bba71e99e84
# ╟─2051f633-9e71-4435-938c-0d55f422c305
# ╟─468ac04f-1e94-49eb-aea9-6d5c46511971
# ╟─1682dc34-08e4-40f5-a8c3-ab19c9106ddc
# ╟─16221009-0aff-493d-8d4a-5d3dce800399
# ╟─1ebfa3d0-c873-429e-9371-36dc2b4ded6f
# ╟─cb01957c-8743-495c-8dcd-4cd25520a5ba
# ╟─69c0e5a1-7d93-41b3-9414-e56bfe1e4d0f
# ╠═0a250130-3c08-4f28-89e9-2921327af919
# ╟─40b4af69-d7ff-4098-b0ec-2560561a629f
# ╠═fdc0a2ad-56ec-4a24-9bb7-0d3643985146
# ╠═46d43c34-fa41-49fe-94b1-0f0055cac140
# ╟─7ebed273-5400-4031-aea6-d95ca3b87ef2
# ╠═21093972-89fd-466f-93e0-9a0e7a3d555f
# ╠═4982c0d0-09bd-4791-a5ac-ace5a7b33994
# ╠═8c91ff03-b9bc-4bf2-9838-94c9bb90a4ea
# ╟─93b4e1ad-ed80-4d77-8176-29aa29b79d56
# ╟─1e651f68-9011-450b-a44c-5d4e37c5cfeb
# ╠═50b0d49e-e3cd-40bb-bf34-936eaddbb052
# ╟─2e87451a-8271-472a-930f-8a1012843889
# ╟─eaf78014-0fe4-4e57-8a4c-a7546fc184eb
# ╟─68fcf117-9465-4ac2-91b0-591712180e0d
# ╟─4141c689-741b-48c1-9719-30affea3026d
# ╟─2ca6e915-8db1-4187-8b33-ba73654495b8
# ╟─677e4957-834b-4991-bc76-0f4a2a953ebc
# ╟─ee30a184-7fe3-4ea9-af0e-06c3678ae832
# ╠═4e3d793e-921c-4674-b125-58184f0fc6be
# ╠═65488b5d-effb-4d5a-90cd-a845ba920ee1
# ╟─f196cc7a-1e18-4968-8bf7-b62aa34d2ad2
# ╠═eae290ac-e2ca-470f-ab80-d2cc31debbf7
# ╟─0cf3b21c-1900-4d57-aa18-c7023c55ca94
# ╟─3b60e4f3-fbc8-4f62-84f1-667fd777371e
# ╟─b4ff2923-3dd9-46dc-bcd8-12462a16e5c6
# ╟─82c7bc2d-d577-4234-8be4-269346cd0f81
# ╟─0a64fca3-0351-4f78-ac74-4af87941a82d
# ╟─3d7de276-05a5-49ee-b0fe-0b928a0aa57d
# ╟─c890a64f-1011-4124-8f57-028339ccf4c0
# ╟─6f2d2c92-b279-4a00-bf63-78a5aa1ce35e
# ╟─28e3172a-6356-4e92-998c-d1add7b18fb1
# ╠═799319b5-fe95-42d2-b44c-107c0f18c790
# ╠═cd75fb8c-9040-49db-83dc-e7c8ab5a7103
# ╟─2731d186-8248-43e5-8b6a-a49316d7517e
# ╠═487c0837-af13-4483-8858-da4542187e86
# ╟─1f1795ef-5f25-4be6-a7a9-de71c98dc1f0
# ╟─1a255baf-a61b-44fb-a2d3-9b73e944a5da
# ╟─1a2415a2-ef35-4024-82a6-c158caeb69b5
# ╟─19366ee0-172e-48be-8108-20466d202d62
# ╟─2fc7d727-bc7d-484d-ae84-38a6a6d39994
# ╟─15d2e0b1-e3f1-4e1f-8a65-049d90d441b5
# ╟─9344a0bb-773a-4ecf-924c-05a03f5393bc
# ╟─57ef4e3d-12fe-4367-9d8c-9d32fd430e4f
# ╟─0aae5ee1-3b16-4998-87b9-17a709e3e3b8
# ╠═509af447-81f1-417c-91cc-38eb3ba53c34
# ╟─d6e98f18-7e3e-437b-8050-fb432a7038c1
# ╟─b3844c5b-fec5-4cac-84b0-7e39d92779e0
# ╟─32a331e9-09bc-4384-ac60-de4f256c3ca3
# ╟─f16be015-274d-45ab-a4f8-940b2bd92257
# ╠═8f1229ea-5bab-41c9-9df3-297b003aaaae
# ╟─3c20500a-6f84-4365-ad29-89bc28bbb902
# ╠═cfe37852-631e-46bc-b66b-e498b07bb87c
# ╠═8d962291-1e09-4537-a0d9-f501f68204f7
# ╠═3464c7e7-a4fd-4a68-988e-10e5c46f4963
# ╠═e38339a2-67dc-4dc7-8526-4246be1961a5
# ╠═f7fc74d0-e9da-4bed-a761-1a8fc903d73c
# ╠═a77826bb-cb81-480f-87a0-0fda0c7574e8
# ╠═72718d96-5560-42d9-b481-3ee7f5d6e000
# ╠═c5394a1b-71c9-4036-a6c6-9fbfc6e19919
# ╠═f7f8d346-48d4-4fa1-a8e5-906dfd287ad8
# ╠═e7252665-e6b9-4b94-bd32-e457fba186a2
# ╠═8e090ee9-6c76-4d4c-a824-7daf13a297c9
# ╠═ec22cfed-8886-46e8-83ce-07e45e90855a
# ╠═e61dab3e-18ca-4cb3-8302-0df674db512a
# ╠═baf6d4f6-192b-4a23-8075-828bbc447254
# ╠═9d2e9033-b197-40e4-bb35-d7a7b2f2de81
# ╠═104c83cc-93d5-43e9-b1ce-5270b11c3f78
# ╠═beba8f6d-ad04-404e-9b69-b670f9d119ec
# ╠═b2da69cd-38ed-4516-b7e4-17013462ace5
# ╟─86afd175-eed8-497d-bcd9-76e1f955eed0
# ╟─c6c36217-2dd2-44c7-9676-236d9cf35981
# ╟─37214219-e656-4f18-87d5-2b85a9abf270
# ╟─b3b326bd-36d9-412f-ad66-fc5dcda2fb56
# ╠═33f6450a-7f65-40e6-a69f-346ec37c75f6
# ╟─1865065c-144a-4c2d-a548-3c0c38c6fa20
# ╟─1a4051a5-64ac-4f3f-9fce-749aa988a855
# ╟─4f04194d-d766-45c1-91bf-6b7bbd47e5f7
# ╟─ccc215b0-afde-4dc1-907a-853bdb98fe2a
# ╟─4b5f30f0-7afc-4645-a08a-049f8ea3f3c1
# ╟─1fe4233f-b613-4618-8604-60422f9daab4
# ╟─df82699b-3aa3-4cab-8aa0-f2409a4dccf6
# ╟─b2ee7359-8d59-40a1-9c7f-8c5c4553f7f2
# ╟─a393c840-8ed5-46cd-a7f2-e4a5e4b416a0
# ╟─ddb5bcc8-0743-4af3-931f-8d59621fcbbe
# ╟─e1ce83df-edf2-4604-a7b5-e4c753ced87d
