Synthetic models

You are viewing the HTML output generated by Documenter.jl and Literate.jl from the Julia source file. The corresponding notebook can be viewed in nbviewer, and the plain script output can be found here.

Note

If you want to run the experiments yourself, make sure your environment is set up identically: use Julia 1.5.3 and activate and instantiate the environment using this Project.toml file and this Manifest.toml file.

The GitHub repository contains more detailed instructions and a Nix project environment with a pinned Julia binary for improved reproducibility.

Packages

using CSV
using CairoMakie
using CalibrationErrors
using CalibrationErrorsDistributions
using CalibrationTests
using DataFrames
using Distributions
using FillArrays
using ProgressLogging
using Query
using Showoff
using StatsBase

using LinearAlgebra
using Printf
using Random

using CairoMakie.AbstractPlotting.ColorSchemes: Dark2_8

using Logging: with_logger
using TerminalLoggers: TerminalLogger

# set random seed
Random.seed!(1234)

# create path before saving
function wsavefig(file, fig=current_figure())
    mkpath(dirname(file))
    return save(file, fig)
end

# define progress logging frontend
const PROGRESSLOGGER = TerminalLogger()

# define non-intrusive plotting style
set_theme!(
    Theme(;
        Axis=(
            rightspinevisible=false,
            topspinevisible=false,
            xgridvisible=false,
            ygridvisible=false,
        ),
        Legend=(framevisible=false,),
    ),
)

Synthetic models

We study two setups with $d$-dimensional targets $Y$ and normal distributions $P_X$ of the form $\mathcal{N}(c \mathbf{1}_d, 0.1^2 \mathbf{I}_d)$ as predictions, where $c \sim \mathrm{U}(0, 1)$. Since calibration analysis is based only on the targets and the predicted distributions, we neglect the features $X$ in these experiments and specify only the distributions of $Y$ and $P_X$.
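
For concreteness, here is a minimal sketch (not part of the experiments) of a single draw in this setup; the dimension `d = 3` is an arbitrary choice, and, as in the calibrated setup below, the target is simply a draw from the prediction.

# illustration only: one prediction P_X = N(c 1_d, 0.1² I_d) with c ~ U(0, 1),
# and one target sampled from it
d = 3
c = rand()
prediction = MvNormal(Fill(c, d), 0.1)
target = rand(prediction)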

Calibrated setup

In the first setup we simulate a calibrated model. We achieve this by sampling targets from the predicted distributions, i.e., by defining the conditional distribution of $Y$ given $P_X$ as

\[Y \,|\, P_X = \mathcal{N}(\mu, \Sigma) \sim \mathcal{N}(\mu, \Sigma).\]

function calibrated_model(dim::Int, nsamples::Int)
    # sample predictions
    predictions = [MvNormal(Fill(rand(), dim), 0.1) for _ in 1:nsamples]

    # sample targets
    targets = map(rand, predictions)

    return predictions, targets
end
calibrated_model (generic function with 1 method)

Uncalibrated setup

In the second setup we simulate an uncalibrated model of the form

\[Y \,|\, P_X = \mathcal{N}(\mu, \Sigma) \sim \mathcal{N}([0.1, \mu_2, \ldots, \mu_d], \Sigma).\]

function uncalibrated_model(dim::Int, nsamples::Int)
    # sample predictions
    predictions = [MvNormal(Fill(rand(), dim), 0.1) for _ in 1:nsamples]

    # sample targets
    targets = map(rand, predictions)
    altdist = Normal(0.1, 0.1)
    for t in targets
        t[1] = rand(altdist)
    end

    return predictions, targets
end
uncalibrated_model (generic function with 1 method)

Convergence and computation time of estimators

We evaluate the convergence and computation time of the biased estimator $\widehat{\mathrm{SKCE}}_k$ and of the unbiased estimators $\widehat{\mathrm{SKCE}}_{k,B}$ with blocks of size $B \in \{2, \sqrt{n}, n\}$. We use the tensor product kernel

\[\begin{aligned} k\big((p, y), (p', y')\big) &= \exp{\big(- W_2(p, p')\big)} \exp{\big(-(y - y')^2/2\big)} \\ &= \exp{\big(-\sqrt{(m_p - m_{p'})^2 + (\sigma_p - \sigma_{p'})^2}\big)} \exp{\big( - (y - y')^2/2\big)}, \end{aligned}\]

where $W_2$ is the 2-Wasserstein distance and $m_p, m_{p'}$ and $\sigma_p, \sigma_{p'}$ denote the mean and the standard deviation of the normal distributions $p$ and $p'$.
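
As a sanity check, the kernel value can be evaluated directly from this closed-form expression. The following sketch is purely illustrative and does not use the package API; `kernel_value` is a hypothetical helper for two univariate normal predictions and scalar targets.

function kernel_value(p::Normal, y::Real, p′::Normal, y′::Real)
    # 2-Wasserstein distance between the two univariate normal distributions
    w2 = sqrt((mean(p) - mean(p′))^2 + (std(p) - std(p′))^2)
    # product of the exponential kernel in the predictions and
    # the squared exponential kernel in the targets
    return exp(-w2) * exp(-(y - y′)^2 / 2)
end

kernel_value(Normal(0.3, 0.1), 0.25, Normal(0.7, 0.1), 0.8)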

Ground truth

For both models, we have to "evaluate" the true calibration error. In general, the error depends on the model (and hence also on the dimension $d$) and on the kernel. If the model is calibrated, we know that the calibration error is zero. For the uncalibrated model, we estimate the ground truth with the minimum-variance unbiased estimator, as the mean of SKCE estimates for 1000 randomly sampled datasets of 1000 data points each.

true_SKCE(::typeof(calibrated_model), kernel; dim::Int) = 0.0
function true_SKCE(model::typeof(uncalibrated_model), kernel; dim::Int)
    estimator = UnbiasedSKCE(kernel)
    return mean(calibrationerror(estimator, model(dim, 1_000)...) for _ in 1:1_000)
end
true_SKCE (generic function with 2 methods)

Benchmarking

The following two functions implement the benchmarking. For the models of dimension $d=1$ and $d=10$, we sample 500 datasets with 4, 16, 64, 256, and 1024 data points each. For each dataset, we evaluate the different SKCE estimators. We report the mean absolute error, the variance, and the minimum computation time of the estimates, grouped by the dimension of the model and the number of samples in the dataset.

function benchmark_estimator(estimator, model; dim::Int, nsamples::Int, groundtruth)
    # compute the estimator (potentially depending on number of samples)
    _estimator = estimator(nsamples)

    # cache for calibration error estimates
    estimates = Vector{Float64}(undef, 500)

    mintime = Inf

    name = @sprintf("benchmarking (dim = %2d, nsamples = %4d)", dim, nsamples)
    @progress name = name for i in eachindex(estimates)
        # sample predictions and targets
        predictions, targets = model(dim, nsamples)

        # define benchmark function
        benchmark_f =
            let estimator = _estimator, predictions = predictions, targets = targets
                () -> @timed calibrationerror(estimator, predictions, targets)
            end

        # precompile function
        benchmark_f()

        # compute calibration error and obtain elapsed time
        val, t = benchmark_f()

        # only keep minimum execution time
        mintime = min(mintime, t)

        # save error estimate
        estimates[i] = val
    end

    # save the mean absolute deviation and the variance of the estimates
    meanerror = mean(abs(x - groundtruth) for x in estimates)
    variance = var(estimates)

    return (; dim, nsamples, meanerror, variance, mintime)
end

function benchmark_estimators(model)
    # output file
    filename = joinpath("data", "synthetic", "errors_$(model).csv")

    # check if results exist
    isfile(filename) && return DataFrame(CSV.File(filename))

    # define kernel
    kernel = WassersteinExponentialKernel() ⊗ SqExponentialKernel()

    # define estimators
    estimators = (
        "SKCE" => _ -> BiasedSKCE(kernel),
        "SKCE (B = 2)" => _ -> BlockUnbiasedSKCE(kernel, 2),
        "SKCE (B = √n)" => n -> BlockUnbiasedSKCE(kernel, max(2, Int(floor(sqrt(n))))),
        "SKCE (B = n)" => _ -> UnbiasedSKCE(kernel),
    )

    # define number of samples
    nsamples = 2 .^ (2:2:10)

    # ensure that output directory exists and open file for writing
    mkpath(dirname(filename))
    open(filename, "w") do file
        # write headers
        println(file, "estimator,dim,nsamples,meanerror,variance,mintime")

        # for dimensions ``d=1`` and ``d=10``
        for d in (1, 10)
            # compute/estimate ground truth
            groundtruth = true_SKCE(model, kernel; dim=d)

            for (i, (name, estimator)) in enumerate(estimators)
                # benchmark estimator
                @info "benchmarking estimator: $(name)"

                for n in nsamples
                    stats = benchmark_estimator(
                        estimator, model; dim=d, nsamples=n, groundtruth=groundtruth
                    )

                    # save statistics
                    print(file, name, ",")
                    join(file, stats, ",")
                    println(file)
                end
            end
        end
    end

    # load results
    return DataFrame(CSV.File(filename))
end
benchmark_estimators (generic function with 1 method)

We benchmark the estimators with the calibrated model.

Random.seed!(100)
with_logger(PROGRESSLOGGER) do
    benchmark_estimators(calibrated_model)
end

40 rows × 6 columns

| Row | estimator | dim | nsamples | meanerror | variance | mintime |
|----:|:----------|----:|---------:|----------:|---------:|--------:|
| 1 | SKCE | 1 | 4 | 0.00232646 | 5.40611e-6 | 8.39e-7 |
| 2 | SKCE | 1 | 16 | 0.000612749 | 3.25968e-7 | 1.1546e-5 |
| 3 | SKCE | 1 | 64 | 0.000160806 | 2.0837e-8 | 0.000182119 |
| 4 | SKCE | 1 | 256 | 3.87249e-5 | 1.37054e-9 | 0.00293836 |
| 5 | SKCE | 1 | 1024 | 9.62075e-6 | 7.10919e-11 | 0.0473988 |
| 6 | SKCE (B = 2) | 1 | 4 | 0.00322484 | 2.14091e-5 | 2.48e-7 |
| 7 | SKCE (B = 2) | 1 | 16 | 0.00178976 | 5.51153e-6 | 1.318e-6 |
| 8 | SKCE (B = 2) | 1 | 64 | 0.000823384 | 1.11883e-6 | 3.404e-6 |
| 9 | SKCE (B = 2) | 1 | 256 | 0.000480282 | 3.55902e-7 | 1.3808e-5 |
| 10 | SKCE (B = 2) | 1 | 1024 | 0.000235386 | 8.60559e-8 | 5.5695e-5 |
| 11 | SKCE (B = √n) | 1 | 4 | 0.00329201 | 2.34735e-5 | 2.34e-7 |
| 12 | SKCE (B = √n) | 1 | 16 | 0.00105996 | 1.96389e-6 | 2.186e-6 |
| 13 | SKCE (B = √n) | 1 | 64 | 0.000342314 | 1.86192e-7 | 2.014e-5 |
| 14 | SKCE (B = √n) | 1 | 256 | 0.000121204 | 2.31226e-8 | 0.000173367 |
| 15 | SKCE (B = √n) | 1 | 1024 | 4.00685e-5 | 2.66636e-9 | 0.00143577 |
| 16 | SKCE (B = n) | 1 | 4 | 0.00183983 | 8.07962e-6 | 5.37e-7 |
| 17 | SKCE (B = n) | 1 | 16 | 0.000417513 | 3.18398e-7 | 1.0872e-5 |
| 18 | SKCE (B = n) | 1 | 64 | 9.96033e-5 | 1.92571e-8 | 0.000178632 |
| 19 | SKCE (B = n) | 1 | 256 | 2.48102e-5 | 1.25886e-9 | 0.00291616 |
| 20 | SKCE (B = n) | 1 | 1024 | 6.07345e-6 | 7.73514e-11 | 0.0471831 |
| 21 | SKCE | 10 | 4 | 0.0236122 | 3.94317e-5 | 1.104e-6 |
| 22 | SKCE | 10 | 16 | 0.00590577 | 1.67794e-6 | 1.6572e-5 |
| 23 | SKCE | 10 | 64 | 0.00147388 | 9.22523e-8 | 0.000266736 |
| 24 | SKCE | 10 | 256 | 0.000368578 | 5.66692e-9 | 0.00428609 |
| 25 | SKCE | 10 | 1024 | 9.10077e-5 | 3.19037e-10 | 0.0685317 |
| 26 | SKCE (B = 2) | 10 | 4 | 0.00615017 | 8.17789e-5 | 2.96e-7 |
| 27 | SKCE (B = 2) | 10 | 16 | 0.00327516 | 1.83537e-5 | 1.162e-6 |
| 28 | SKCE (B = 2) | 10 | 64 | 0.00181648 | 5.34779e-6 | 4.901e-6 |
| 29 | SKCE (B = 2) | 10 | 256 | 0.000962758 | 1.48284e-6 | 2.0093e-5 |
| 30 | SKCE (B = 2) | 10 | 1024 | 0.000462092 | 3.33079e-7 | 8.167e-5 |
| 31 | SKCE (B = √n) | 10 | 4 | 0.00593596 | 8.45258e-5 | 3.07e-7 |
| 32 | SKCE (B = √n) | 10 | 16 | 0.00216761 | 7.10482e-6 | 3.296e-6 |
| 33 | SKCE (B = √n) | 10 | 64 | 0.000666048 | 7.00752e-7 | 3.0625e-5 |
| 34 | SKCE (B = √n) | 10 | 256 | 0.000246347 | 9.09186e-8 | 0.000265018 |
| 35 | SKCE (B = √n) | 10 | 1024 | 8.47757e-5 | 1.10709e-8 | 0.00208533 |
| 36 | SKCE (B = n) | 10 | 4 | 0.00410328 | 2.93761e-5 | 7.2e-7 |
| 37 | SKCE (B = n) | 10 | 16 | 0.000955665 | 1.41091e-6 | 1.5352e-5 |
| 38 | SKCE (B = n) | 10 | 64 | 0.000231735 | 8.77091e-8 | 0.000260414 |
| 39 | SKCE (B = n) | 10 | 256 | 5.8408e-5 | 5.31428e-9 | 0.00424718 |
| 40 | SKCE (B = n) | 10 | 1024 | 1.4255e-5 | 3.31928e-10 | 0.0682267 |

We repeat the benchmark with the uncalibrated model.

Random.seed!(100)
with_logger(PROGRESSLOGGER) do
    benchmark_estimators(uncalibrated_model)
end

40 rows × 6 columns

| Row | estimator | dim | nsamples | meanerror | variance | mintime |
|----:|:----------|----:|---------:|----------:|---------:|--------:|
| 1 | SKCE | 1 | 4 | 0.0751342 | 0.00996145 | 8.4e-7 |
| 2 | SKCE | 1 | 16 | 0.0314564 | 0.00169231 | 1.1676e-5 |
| 3 | SKCE | 1 | 64 | 0.0159183 | 0.000394578 | 0.000184502 |
| 4 | SKCE | 1 | 256 | 0.00802689 | 0.000102252 | 0.00295501 |
| 5 | SKCE | 1 | 1024 | 0.00423135 | 2.77658e-5 | 0.047851 |
| 6 | SKCE (B = 2) | 1 | 4 | 0.078777 | 0.00982905 | 3.54e-7 |
| 7 | SKCE (B = 2) | 1 | 16 | 0.0406132 | 0.00246662 | 8.83e-7 |
| 8 | SKCE (B = 2) | 1 | 64 | 0.0194201 | 0.000600167 | 3.699e-6 |
| 9 | SKCE (B = 2) | 1 | 256 | 0.00987522 | 0.00015909 | 1.4895e-5 |
| 10 | SKCE (B = 2) | 1 | 1024 | 0.00511378 | 4.13817e-5 | 6.0314e-5 |
| 11 | SKCE (B = √n) | 1 | 4 | 0.0811177 | 0.0102605 | 2.35e-7 |
| 12 | SKCE (B = √n) | 1 | 16 | 0.0342851 | 0.00194094 | 2.25e-6 |
| 13 | SKCE (B = √n) | 1 | 64 | 0.0174102 | 0.000456507 | 2.0407e-5 |
| 14 | SKCE (B = √n) | 1 | 256 | 0.00834639 | 0.000107721 | 0.000175991 |
| 15 | SKCE (B = √n) | 1 | 1024 | 0.00414902 | 2.71046e-5 | 0.00146066 |
| 16 | SKCE (B = n) | 1 | 4 | 0.0714118 | 0.00821249 | 5.37e-7 |
| 17 | SKCE (B = n) | 1 | 16 | 0.0342723 | 0.00187879 | 1.0352e-5 |
| 18 | SKCE (B = n) | 1 | 64 | 0.0169261 | 0.000437577 | 0.000178923 |
| 19 | SKCE (B = n) | 1 | 256 | 0.00838566 | 0.000106268 | 0.00295301 |
| 20 | SKCE (B = n) | 1 | 1024 | 0.00426478 | 2.90507e-5 | 0.047773 |
| 21 | SKCE | 10 | 4 | 0.0605164 | 0.00428938 | 1.113e-6 |
| 22 | SKCE | 10 | 16 | 0.0234032 | 0.000668417 | 1.6879e-5 |
| 23 | SKCE | 10 | 64 | 0.0103131 | 0.000154495 | 0.000271091 |
| 24 | SKCE | 10 | 256 | 0.00498841 | 3.99274e-5 | 0.00435522 |
| 25 | SKCE | 10 | 1024 | 0.00235985 | 9.21217e-6 | 0.0694433 |
| 26 | SKCE (B = 2) | 10 | 4 | 0.0544382 | 0.00497703 | 2.99e-7 |
| 27 | SKCE (B = 2) | 10 | 16 | 0.0283976 | 0.00121663 | 1.177e-6 |
| 28 | SKCE (B = 2) | 10 | 64 | 0.0133977 | 0.000290239 | 4.817e-6 |
| 29 | SKCE (B = 2) | 10 | 256 | 0.00687261 | 7.63347e-5 | 1.9724e-5 |
| 30 | SKCE (B = 2) | 10 | 1024 | 0.00359383 | 1.99858e-5 | 7.9659e-5 |
| 31 | SKCE (B = √n) | 10 | 4 | 0.0567095 | 0.00549465 | 2.99e-7 |
| 32 | SKCE (B = √n) | 10 | 16 | 0.0225549 | 0.000846545 | 3.13e-6 |
| 33 | SKCE (B = √n) | 10 | 64 | 0.0102516 | 0.000172733 | 2.9899e-5 |
| 34 | SKCE (B = √n) | 10 | 256 | 0.00560227 | 4.84788e-5 | 0.000258742 |
| 35 | SKCE (B = √n) | 10 | 1024 | 0.00230399 | 8.42511e-6 | 0.00214255 |
| 36 | SKCE (B = n) | 10 | 4 | 0.0434448 | 0.00335596 | 7.28e-7 |
| 37 | SKCE (B = n) | 10 | 16 | 0.0188946 | 0.000584298 | 1.5216e-5 |
| 38 | SKCE (B = n) | 10 | 64 | 0.00994934 | 0.000150885 | 0.000263802 |
| 39 | SKCE (B = n) | 10 | 256 | 0.0053346 | 4.57406e-5 | 0.00430886 |
| 40 | SKCE (B = n) | 10 | 1024 | 0.00253457 | 9.91621e-6 | 0.0692274 |

Visualization

We show a visualization of the results below.

function logtickformat(base::Int)
    function format(values)
        return map(Base.Fix2(logformat, base), showoff(values))
    end
    return format
end

function logformat(digits::String, base::Int)
    buf = IOBuffer()
    print(buf, base)
    for c in digits
        if '0' ≤ c ≤ '9'
            print(buf, Showoff.superscript_numerals[c - '0' + 1])
        elseif c == '-'
            print(buf, '⁻')
        elseif c == '.'
            print(buf, '·')
        end
    end
    return String(take!(buf))
end

function plot_benchmark_estimators(model; dim::Int)
    # load and preprocess data
    filename = joinpath("data", "synthetic", "errors_$(model).csv")
    groups = @from i in DataFrame(CSV.File(filename)) begin
        @where i.dim == dim
        @orderby i.nsamples
        @select {
            i.estimator,
            log2_nsamples = log2(i.nsamples),
            log10_meanerror = log10(i.meanerror),
            log10_variance = log10(i.variance),
            log10_mintime = log10(i.mintime),
        }
        @collect DataFrame
    end

    # create figure
    fig = Figure(; resolution=(960, 800))

    # create axes to plot mean error and variance vs number of samples
    ax1 = Axis(
        fig[1, 1];
        xlabel="# samples",
        ylabel="mean error",
        xticks=2:2:10,
        xtickformat=logtickformat(2),
        ytickformat=logtickformat(10),
    )
    ax2 = Axis(
        fig[2, 1];
        xlabel="# samples",
        ylabel="variance",
        xticks=2:2:10,
        xtickformat=logtickformat(2),
        ytickformat=logtickformat(10),
    )

    # create axes to plot mean error and variance vs timings
    ax3 = Axis(
        fig[1, 2];
        xlabel="time [s]",
        ylabel="mean error",
        xtickformat=logtickformat(10),
        ytickformat=logtickformat(10),
    )
    ax4 = Axis(
        fig[2, 2];
        xlabel="time [s]",
        ylabel="variance",
        xtickformat=logtickformat(10),
        ytickformat=logtickformat(10),
    )

    # plot benchmark results
    estimators = ["SKCE", "SKCE (B = 2)", "SKCE (B = √n)", "SKCE (B = n)"]
    markers = ['●', '■', '▲', '◆']
    for (i, (estimator, marker)) in enumerate(zip(estimators, markers))
        group = filter(:estimator => ==(estimator), groups)
        color = Dark2_8[i]

        # plot mean error vs samples
        scatterlines!(
            ax1,
            group.log2_nsamples,
            group.log10_meanerror;
            color=color,
            linewidth=2,
            marker=marker,
            markercolor=color,
        )

        # plot variance vs samples
        scatterlines!(
            ax2,
            group.log2_nsamples,
            group.log10_variance;
            color=color,
            linewidth=2,
            marker=marker,
            markercolor=color,
        )

        # plot mean error vs time
        scatterlines!(
            ax3,
            group.log10_mintime,
            group.log10_meanerror;
            color=color,
            linewidth=2,
            marker=marker,
            markercolor=color,
        )

        # plot variance vs time
        scatterlines!(
            ax4,
            group.log10_mintime,
            group.log10_variance;
            color=color,
            linewidth=2,
            marker=marker,
            markercolor=color,
        )
    end

    # link axes and hide decorations
    linkxaxes!(ax1, ax2)
    hidexdecorations!(ax1)
    linkxaxes!(ax3, ax4)
    hidexdecorations!(ax3)
    linkyaxes!(ax1, ax3)
    hideydecorations!(ax3)
    linkyaxes!(ax2, ax4)
    hideydecorations!(ax4)

    # add legend
    elems = map(1:length(estimators)) do i
        [
            LineElement(; color=Dark2_8[i], linestyle=nothing, linewidth=2),
            MarkerElement(; color=Dark2_8[i], marker=markers[i], strokecolor=:black),
        ]
    end
    Legend(fig[end + 1, :], elems, estimators; orientation=:horizontal, tellheight=true)

    return fig
end
plot_benchmark_estimators (generic function with 1 method)

We obtain the following plots:

plot_benchmark_estimators(calibrated_model; dim=1)
wsavefig("figures/synthetic/estimators_calibrated_model_dim=1.svg");

plot_benchmark_estimators(calibrated_model; dim=10)
wsavefig("figures/synthetic/estimators_calibrated_model_dim=10.svg");

plot_benchmark_estimators(uncalibrated_model; dim=1)
wsavefig("figures/synthetic/estimators_uncalibrated_model_dim=1.svg");

plot_benchmark_estimators(uncalibrated_model; dim=10)
wsavefig("figures/synthetic/estimators_uncalibrated_model_dim=10.svg");

Test errors and computation time of calibration tests

We fix the significance level $\alpha = 0.05$. For the CME test, the test predictions are sampled from the same distribution as $P_X$, and the test targets are sampled independently from $\mathcal{N}(0, 0.1^2 \mathbf{I}_d)$.
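
As an illustration (not part of the benchmark below), a single test decision at this significance level could be obtained as follows; the dataset size and dimension are arbitrary choices.

# illustration only: p-value of the asymptotic SKCE test on one sampled dataset
kernel = WassersteinExponentialKernel() ⊗ SqExponentialKernel()
predictions, targets = calibrated_model(1, 64)
test = AsymptoticSKCETest(kernel, predictions, targets)
reject = pvalue(test) < 0.05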

Benchmarking

iscalibrated(::typeof(calibrated_model)) = true
iscalibrated(::typeof(uncalibrated_model)) = false

function benchmark_test(test, model; dim::Int, nsamples::Int)
    # number of simulations
    nrepeat = 500

    # initial values
    ntesterrors = 0
    mintime = Inf

    name = @sprintf("benchmarking (dim = %2d, nsamples = %4d)", dim, nsamples)
    @progress name = name for _ in 1:nrepeat
        # sample predictions and targets
        predictions, targets = model(dim, nsamples)

        # define benchmark function
        benchmark_f = let test = test, predictions = predictions, targets = targets
            () -> @timed pvalue(test(predictions, targets))
        end

        # precompile function
        benchmark_f()

        # compute calibration error and obtain elapsed time
        val, t = benchmark_f()

        # only keep minimum execution time
        mintime = min(mintime, t)

        # update number of empirical test errors for
        # significance level ``\alpha = 0.05``
        ntesterrors += iscalibrated(model) ⊻ (val ≥ 0.05)
    end

    # compute empirical test error rate
    testerror = ntesterrors / nrepeat

    return (; dim, nsamples, testerror, mintime)
end

function benchmark_tests(model)
    # output file
    filename = joinpath("data", "synthetic", "tests_$(model).csv")

    # check if results exist
    isfile(filename) && return DataFrame(CSV.File(filename))

    # define kernel
    kernel = WassersteinExponentialKernel() ⊗ SqExponentialKernel()

    # define number of samples
    nsamples = 2 .^ (2:2:10)

    # ensure that output directory exists and open file for writing
    mkpath(dirname(filename))
    open(filename, "w") do file
        # write headers
        println(file, "test,dim,nsamples,testerror,mintime")

        # for dimensions ``d=1`` and ``d=10``
        for d in (1, 10)
            # define tests
            testpredictions = [MvNormal(rand(d), 0.1) for _ in 1:10]
            testtargets = [rand(MvNormal(d, 0.1)) for _ in 1:10]
            tests = (
                "SKCE (B = 2)" =>
                    (predictions, targets) -> AsymptoticBlockSKCETest(
                        BlockUnbiasedSKCE(kernel, 2), predictions, targets
                    ),
                "SKCE (B = √n)" =>
                    (predictions, targets) -> AsymptoticBlockSKCETest(
                        BlockUnbiasedSKCE(kernel, Int(floor(sqrt(length(predictions))))),
                        predictions,
                        targets,
                    ),
                "SKCE (B = n)" =>
                    (predictions, targets) ->
                        AsymptoticSKCETest(kernel, predictions, targets),
                "CME" =>
                    (predictions, targets) -> AsymptoticCMETest(
                        UCME(kernel, testpredictions, testtargets), predictions, targets
                    ),
            )

            for (i, (name, test)) in enumerate(tests)
                # benchmark estimator
                @info "benchmarking test: $(name)"

                for n in nsamples
                    stats = benchmark_test(test, model; dim=d, nsamples=n)

                    # save statistics
                    print(file, name, ",")
                    join(file, stats, ",")
                    println(file)
                end
            end
        end
    end

    # load results
    return DataFrame(CSV.File(filename))
end
benchmark_tests (generic function with 1 method)

First we benchmark the calibrated model.

Random.seed!(100)
with_logger(PROGRESSLOGGER) do
    benchmark_tests(calibrated_model)
end

40 rows × 5 columns

| Row | test | dim | nsamples | testerror | mintime |
|----:|:-----|----:|---------:|----------:|--------:|
| 1 | SKCE (B = 2) | 1 | 4 | 0.128 | 1.296e-6 |
| 2 | SKCE (B = 2) | 1 | 16 | 0.05 | 4.202e-6 |
| 3 | SKCE (B = 2) | 1 | 64 | 0.054 | 1.7021e-5 |
| 4 | SKCE (B = 2) | 1 | 256 | 0.054 | 6.4322e-5 |
| 5 | SKCE (B = 2) | 1 | 1024 | 0.06 | 0.000133484 |
| 6 | SKCE (B = √n) | 1 | 4 | 0.134 | 1.304e-6 |
| 7 | SKCE (B = √n) | 1 | 16 | 0.038 | 5.944e-6 |
| 8 | SKCE (B = √n) | 1 | 64 | 0.03 | 4.2991e-5 |
| 9 | SKCE (B = √n) | 1 | 256 | 0.02 | 0.000180911 |
| 10 | SKCE (B = √n) | 1 | 1024 | 0.02 | 0.00145807 |
| 11 | SKCE (B = n) | 1 | 4 | 0.142 | 8.6026e-5 |
| 12 | SKCE (B = n) | 1 | 16 | 0.058 | 0.00045643 |
| 13 | SKCE (B = n) | 1 | 64 | 0.04 | 0.00403782 |
| 14 | SKCE (B = n) | 1 | 256 | 0.06 | 0.0504854 |
| 15 | SKCE (B = n) | 1 | 1024 | 0.056 | 1.04588 |
| 16 | CME | 1 | 4 | 0.53 | 1.4472e-5 |
| 17 | CME | 1 | 16 | 0.882 | 2.8276e-5 |
| 18 | CME | 1 | 64 | 0.356 | 6.2975e-5 |
| 19 | CME | 1 | 256 | 0.118 | 0.000190426 |
| 20 | CME | 1 | 1024 | 0.068 | 0.000668129 |
| 21 | SKCE (B = 2) | 10 | 4 | 0.102 | 8.0e-7 |
| 22 | SKCE (B = 2) | 10 | 16 | 0.052 | 2.559e-6 |
| 23 | SKCE (B = 2) | 10 | 64 | 0.064 | 1.002e-5 |
| 24 | SKCE (B = 2) | 10 | 256 | 0.048 | 4.0336e-5 |
| 25 | SKCE (B = 2) | 10 | 1024 | 0.048 | 0.000163456 |
| 26 | SKCE (B = √n) | 10 | 4 | 0.108 | 9.71e-7 |
| 27 | SKCE (B = √n) | 10 | 16 | 0.066 | 4.368e-6 |
| 28 | SKCE (B = √n) | 10 | 64 | 0.062 | 3.2725e-5 |
| 29 | SKCE (B = √n) | 10 | 256 | 0.05 | 0.000274685 |
| 30 | SKCE (B = √n) | 10 | 1024 | 0.042 | 0.00224606 |
| 31 | SKCE (B = n) | 10 | 4 | 0.004 | 8.8992e-5 |
| 32 | SKCE (B = n) | 10 | 16 | 0.0 | 0.000483322 |
| 33 | SKCE (B = n) | 10 | 64 | 0.01 | 0.00431849 |
| 34 | SKCE (B = n) | 10 | 256 | 0.042 | 0.0513581 |
| 35 | SKCE (B = n) | 10 | 1024 | 0.044 | 0.796199 |
| 36 | CME | 10 | 4 | 0.478 | 2.4278e-5 |
| 37 | CME | 10 | 16 | 0.76 | 4.922e-5 |
| 38 | CME | 10 | 64 | 0.152 | 7.793e-5 |
| 39 | CME | 10 | 256 | 0.058 | 0.000260368 |
| 40 | CME | 10 | 1024 | 0.074 | 0.000988811 |

We repeat the analysis with the uncalibrated model.

Random.seed!(100)
with_logger(PROGRESSLOGGER) do
    benchmark_tests(uncalibrated_model)
end

40 rows × 5 columns

| Row | test | dim | nsamples | testerror | mintime |
|----:|:-----|----:|---------:|----------:|--------:|
| 1 | SKCE (B = 2) | 1 | 4 | 0.648 | 7.8e-7 |
| 2 | SKCE (B = 2) | 1 | 16 | 0.122 | 4.424e-6 |
| 3 | SKCE (B = 2) | 1 | 64 | 0.0 | 1.714e-5 |
| 4 | SKCE (B = 2) | 1 | 256 | 0.0 | 6.9183e-5 |
| 5 | SKCE (B = 2) | 1 | 1024 | 0.0 | 0.000138376 |
| 6 | SKCE (B = √n) | 1 | 4 | 0.654 | 1.364e-6 |
| 7 | SKCE (B = √n) | 1 | 16 | 0.046 | 3.122e-6 |
| 8 | SKCE (B = √n) | 1 | 64 | 0.0 | 2.3081e-5 |
| 9 | SKCE (B = √n) | 1 | 256 | 0.0 | 0.000190658 |
| 10 | SKCE (B = √n) | 1 | 1024 | 0.0 | 0.00156119 |
| 11 | SKCE (B = n) | 1 | 4 | 0.272 | 8.8509e-5 |
| 12 | SKCE (B = n) | 1 | 16 | 0.0 | 0.000477038 |
| 13 | SKCE (B = n) | 1 | 64 | 0.0 | 0.00401854 |
| 14 | SKCE (B = n) | 1 | 256 | 0.0 | 0.0500722 |
| 15 | SKCE (B = n) | 1 | 1024 | 0.0 | 1.10676 |
| 16 | CME | 1 | 4 | 0.478 | 1.5267e-5 |
| 17 | CME | 1 | 16 | 0.0 | 2.5804e-5 |
| 18 | CME | 1 | 64 | 0.0 | 5.7052e-5 |
| 19 | CME | 1 | 256 | 0.0 | 0.000181037 |
| 20 | CME | 1 | 1024 | 0.0 | 0.000668172 |
| 21 | SKCE (B = 2) | 10 | 4 | 0.748 | 7.82e-7 |
| 22 | SKCE (B = 2) | 10 | 16 | 0.308 | 2.543e-6 |
| 23 | SKCE (B = 2) | 10 | 64 | 0.0 | 9.69e-6 |
| 24 | SKCE (B = 2) | 10 | 256 | 0.0 | 7.4793e-5 |
| 25 | SKCE (B = 2) | 10 | 1024 | 0.0 | 0.000164 |
| 26 | SKCE (B = √n) | 10 | 4 | 0.734 | 9.5e-7 |
| 27 | SKCE (B = √n) | 10 | 16 | 0.148 | 4.326e-6 |
| 28 | SKCE (B = √n) | 10 | 64 | 0.0 | 5.529e-5 |
| 29 | SKCE (B = √n) | 10 | 256 | 0.0 | 0.000460137 |
| 30 | SKCE (B = √n) | 10 | 1024 | 0.0 | 0.00226211 |
| 31 | SKCE (B = n) | 10 | 4 | 0.712 | 8.8602e-5 |
| 32 | SKCE (B = n) | 10 | 16 | 0.002 | 0.000483276 |
| 33 | SKCE (B = n) | 10 | 64 | 0.0 | 0.00431563 |
| 34 | SKCE (B = n) | 10 | 256 | 0.0 | 0.0541269 |
| 35 | SKCE (B = n) | 10 | 1024 | 0.0 | 0.763799 |
| 36 | CME | 10 | 4 | 0.458 | 1.8213e-5 |
| 37 | CME | 10 | 16 | 0.004 | 3.1897e-5 |
| 38 | CME | 10 | 64 | 0.0 | 7.844e-5 |
| 39 | CME | 10 | 256 | 0.0 | 0.00026352 |
| 40 | CME | 10 | 1024 | 0.0 | 0.000995937 |

Visualization

Again we visualize the results of our benchmarks, but this time we compare the results for the calibrated and the uncalibrated model in the same figure.

function plot_benchmark_tests(; dim::Int)
    # load and preprocess data
    df = mapreduce(vcat, (calibrated_model, uncalibrated_model)) do model
        filename = joinpath("data", "synthetic", "tests_$(model).csv")
        df = DataFrame(CSV.File(filename))
        df[!, :model] .= string(model)
        return df
    end
    groups = @from i in df begin
        @where i.dim == dim
        @orderby i.nsamples
        @select {
            i.test,
            i.model,
            log2_nsamples = log2(i.nsamples),
            i.testerror,
            log10_mintime = log10(i.mintime),
        }
        @collect DataFrame
    end

    # create figure
    fig = Figure(; resolution=(960, 400))

    # add labels
    Label(fig[1:2, 1], "empirical test error"; rotation=π / 2, tellheight=false)
    Label(fig[1, 2:3, Top()], "calibrated model"; padding=(0, 0, 10, 0))
    Label(fig[2, 2:3, Top()], "uncalibrated model"; padding=(0, 0, 10, 0))

    # create axes to plot test error vs number of samples
    ax1 = Axis(
        fig[1, 2];
        ylabel="type I error",
        xticks=2:2:10,
        xtickformat=logtickformat(2),
        xticklabelsize=12,
        yticklabelsize=12,
    )
    ax2 = Axis(
        fig[2, 2];
        xlabel="# samples",
        ylabel="type II error",
        xticks=2:2:10,
        xtickformat=logtickformat(2),
        xticklabelsize=12,
        yticklabelsize=12,
    )

    # create axes to plot test error vs timings
    ax3 = Axis(
        fig[1, 3]; xtickformat=logtickformat(10), xticklabelsize=12, yticklabelsize=12
    )
    ax4 = Axis(
        fig[2, 3];
        xlabel="time [s]",
        xtickformat=logtickformat(10),
        xticklabelsize=12,
        yticklabelsize=12,
    )

    # plot benchmark results
    tests = ["SKCE (B = 2)", "SKCE (B = √n)", "SKCE (B = n)", "CME"]
    markers = ['●', '■', '▲', '◆']
    for (i, (test, marker)) in enumerate(zip(tests, markers))
        color = Dark2_8[i]

        # for both calibrated and uncalibrated model
        for (axes, model) in
            zip(((ax1, ax3), (ax2, ax4)), (calibrated_model, uncalibrated_model))
            group = filter(x -> x.test == test && x.model == string(model), groups)

            # plot test error vs samples
            scatterlines!(
                axes[1],
                group.log2_nsamples,
                group.testerror;
                color=color,
                linewidth=2,
                marker=marker,
                markercolor=color,
            )

            # plot test error vs timings
            scatterlines!(
                axes[2],
                group.log10_mintime,
                group.testerror;
                color=color,
                linewidth=2,
                marker=marker,
                markercolor=color,
            )
        end
    end

    # plot horizontal lines for significance level
    for axis in (ax1, ax3)
        hlines!(axis, 0.05; color=:black, linestyle=:dash, linewidth=2)
    end

    # link axes and hide decorations
    linkxaxes!(ax1, ax2)
    hidexdecorations!(ax1)
    linkxaxes!(ax3, ax4)
    hidexdecorations!(ax3)
    linkyaxes!(ax1, ax3)
    hideydecorations!(ax3)
    linkyaxes!(ax2, ax4)
    hideydecorations!(ax4)

    # add legend
    elems = map(1:length(tests)) do i
        [
            LineElement(; color=Dark2_8[i], linestyle=nothing, linewidth=2),
            MarkerElement(; color=Dark2_8[i], marker=markers[i], strokecolor=:black),
        ]
    end
    push!(elems, [LineElement(; color=:black, linestyle=:dash, linewidth=2)])
    Legend(
        fig[1:2, end + 1],
        elems,
        vcat(tests, "significance level");
        tellwidth=true,
        gridshalign=:left,
    )

    return fig
end

plot_benchmark_tests(; dim=1)
wsavefig("figures/synthetic/tests_dim=1.svg");

plot_benchmark_tests(; dim=10)
wsavefig("figures/synthetic/tests_dim=10.svg");


This page was generated using Literate.jl.