-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathcovmats.py
More file actions
818 lines (678 loc) · 29.2 KB
/
covmats.py
File metadata and controls
818 lines (678 loc) · 29.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
"""Module for handling logic and manipulation of covariance and correlation
matrices on different levels of abstraction
"""
import logging
import numpy as np
import pandas as pd
import scipy.linalg as la
from reportengine import collect
from reportengine.table import table
from validphys.calcutils import regularize_covmat, get_df_block
from validphys.checks import (
check_dataset_cuts_match_theorycovmat,
check_norm_threshold,
check_pdf_is_montecarlo,
check_speclabels_different,
check_data_cuts_match_theorycovmat,
check_cuts_considered,
)
from validphys.convolution import central_predictions
from validphys.core import PDF, DataGroupSpec, DataSetSpec
from validphys.covmats_utils import construct_covmat, systematics_matrix
from validphys.results import ThPredictionsResult
# Module-level logger, named after this module per the standard logging convention.
log = logging.getLogger(__name__)
# Systematic-error names whose correlations never extend beyond a single
# dataset; any other named systematic (e.g. SPECIALCORR types) may be
# correlated across datasets and is handled separately.
INTRA_DATASET_SYS_NAME = ("UNCORR", "CORR", "THEORYUNCORR", "THEORYCORR")
def covmat_from_systematics(
    loaded_commondata_with_cuts,
    dataset_input,
    use_weights_in_covmat=True,
    _central_values=None
):
    """Take the statistical uncertainty and systematics table from
    a :py:class:`validphys.coredata.CommonData` object and
    construct the covariance matrix accounting for correlations between
    systematics.

    If the systematic has the name ``SKIP`` then it is ignored in the
    construction of the covariance matrix.

    ADDitive or MULTiplicative systypes are handled by either multiplying
    the additive or multiplicative uncertainties respectively. We convert
    uncertainties so that they are all in the same units as the data:

    - Additive (ADD) systematics are left unchanged
    - multiplicative (MULT) systematics need to be converted from a
      percentage by multiplying by the central value and dividing by 100.

    Finally, the systematics are split into the five possible archetypes
    of systematic uncertainties: uncorrelated (UNCORR), correlated (CORR),
    theory uncorrelated (THEORYUNCORR), theory correlated (THEORYCORR) and
    special correlated (SPECIALCORR) systematics.

    Uncorrelated contributions from statistical error, uncorrelated and
    theory uncorrelated are added in quadrature to the diagonal of the covmat.

    The contribution to the covariance matrix arising due to
    correlated systematics is schematically ``A_correlated @ A_correlated.T``,
    where A_correlated is a matrix N_dat by N_sys. The total contribution
    from correlated systematics is found by adding together the result of
    multiplying each correlated systematic matrix by its transpose
    (correlated, theory_correlated and special_correlated).

    For more information on the generation of the covariance matrix see the
    `paper <https://arxiv.org/pdf/hep-ph/0501067.pdf>`_
    outlining the procedure, specifically equation 2 and surrounding text.

    Parameters
    ----------
    loaded_commondata_with_cuts : validphys.coredata.CommonData
        CommonData which stores information about systematic errors,
        their treatment and description.
    dataset_input: validphys.core.DataSetInput
        Dataset settings, contains the weight for the current dataset.
        The returned covmat will be divided by the dataset weight if
        ``use_weights_in_covmat``. The default weight is 1, which means
        the returned covmat will be unmodified.
    use_weights_in_covmat: bool
        Whether to weight the covmat, True by default.
    _central_values : None, np.array
        1-D array containing alternative central values to combine with the
        multiplicative errors to calculate their absolute contributions. By
        default this is None, and the experimental central values are used. However, this
        can be used to calculate, for example, the t0 covariance matrix by
        using the predictions from the central member of the t0 pdf.

    Returns
    -------
    cov_mat: np.array
        Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
        containing uncertainty and correlation information.

    Example
    -------
    In order to use this function, simply call it from the API

    >>> from validphys.api import API
    >>> inp = dict(
    ...     dataset_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10},
    ...     theoryid=162,
    ...     use_cuts="internal"
    ... )
    >>> cov = API.covmat_from_systematics(**inp)
    >>> cov.shape
    (28, 28)
    """
    stat_errors = loaded_commondata_with_cuts.stat_errors.to_numpy()
    sys_errors = loaded_commondata_with_cuts.systematic_errors(_central_values)
    covmat = construct_covmat(stat_errors, sys_errors)
    if not use_weights_in_covmat:
        return covmat
    # Weighted datasets get a covmat scaled down by the weight.
    return covmat / dataset_input.weight
def dataset_inputs_covmat_from_systematics(
    dataset_inputs_loaded_cd_with_cuts,
    data_input,
    use_weights_in_covmat=True,
    _list_of_central_values=None
):
    """Given a list containing :py:class:`validphys.coredata.CommonData` s,
    construct the full covariance matrix.

    This is similar to :py:meth:`covmat_from_systematics`
    except that special corr systematics are concatenated across all datasets
    before being multiplied by their transpose to give off block-diagonal
    contributions. The other systematics contribute to the block diagonal in the
    same way as :py:meth:`covmat_from_systematics`.

    Parameters
    ----------
    dataset_inputs_loaded_cd_with_cuts : list[validphys.coredata.CommonData]
        list of CommonData objects.
    data_input: list[validphys.core.DataSetInput]
        Settings for each dataset, each element contains the weight for the
        current dataset. The elements of the returned covmat for dataset
        i and j will be divided by sqrt(weight_i)*sqrt(weight_j), if
        ``use_weights_in_covmat``. The default weight is 1, which means
        the returned covmat will be unmodified.
    use_weights_in_covmat: bool
        Whether to weight the covmat, True by default.
    _list_of_central_values: None, list[np.array]
        list of 1-D arrays which contain alternative central values which are
        combined with the multiplicative errors to calculate their absolute
        contribution. By default this is None and the experimental central
        values are used.

    Returns
    -------
    cov_mat : np.array
        Numpy array which is N_dat x N_dat (where N_dat is the number of data points after cuts)
        containing uncertainty and correlation information.

    Example
    -------
    This function can be called directly from the API:

    >>> dsinps = [
    ...     {'dataset': 'NMC'},
    ...     {'dataset': 'ATLASTTBARTOT', 'cfac':['QCD']},
    ...     {'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
    ... ]
    >>> inp = dict(dataset_inputs=dsinps, theoryid=162, use_cuts="internal")
    >>> cov = API.dataset_inputs_covmat_from_systematics(**inp)
    >>> cov.shape
    (235, 235)

    Which properly accounts for all dataset settings and cuts.
    """
    # Per-dataset accumulators: cross-dataset ("special") systematics,
    # within-dataset covmat blocks, and the per-point dataset weights.
    special_corrs = []
    block_diags = []
    weights = []
    if _list_of_central_values is None:
        # want to just pass None to systematic_errors method
        _list_of_central_values = [None] * len(dataset_inputs_loaded_cd_with_cuts)
    for cd, dsinp, central_values in zip(
        dataset_inputs_loaded_cd_with_cuts,
        data_input,
        _list_of_central_values
    ):
        sys_errors = cd.systematic_errors(central_values)
        stat_errors = cd.stat_errors.to_numpy()
        # One weight entry per data point, so the flattened weight vector
        # lines up with the rows of the final covmat.
        weights.append(np.full_like(stat_errors, dsinp.weight))
        # separate out the special uncertainties which can be correlated across
        # datasets
        is_intra_dataset_error = sys_errors.columns.isin(INTRA_DATASET_SYS_NAME)
        block_diags.append(construct_covmat(
            stat_errors, sys_errors.loc[:, is_intra_dataset_error]))
        special_corrs.append(sys_errors.loc[:, ~is_intra_dataset_error])
    # concat systematics across datasets
    special_sys = pd.concat(special_corrs, axis=0, sort=False)
    # non-overlapping systematics are set to NaN by concat, fill with 0 instead.
    special_sys.fillna(0, inplace=True)
    diag = la.block_diag(*block_diags)
    # A @ A.T of the concatenated special systematics gives the
    # off-block-diagonal (cross-dataset) correlated contribution.
    covmat = diag + special_sys.to_numpy() @ special_sys.to_numpy().T
    if use_weights_in_covmat:
        # concatenate weights and sqrt
        sqrt_weights = np.sqrt(np.concatenate(weights))
        # returns C_ij / (sqrt(w_i) * sqrt(w_j))
        return (covmat / sqrt_weights).T / sqrt_weights
    return covmat
@check_cuts_considered
def dataset_t0_predictions(dataset, t0set):
    """Returns the t0 predictions for a ``dataset``: the predictions computed
    with the central member of ``t0set``. Note that if the PDF has errortype
    ``replicas`` and the dataset is a hadronic observable, the central-member
    predictions differ subtly from the central value of the replica
    predictions.

    Parameters
    ----------
    dataset: validphys.core.DataSetSpec
        dataset for which to calculate t0 predictions
    t0set: validphys.core.PDF
        pdf used to calculate the predictions

    Returns
    -------
    t0_predictions: np.array
        1-D numpy array with predictions for each of the cut datapoints.
    """
    predictions = central_predictions(dataset, t0set)
    # The underlying frame is ndata x 1 (even for single-datapoint sets),
    # so flatten it to a 1-D vector.
    return predictions.to_numpy().ravel()
def t0_covmat_from_systematics(
    loaded_commondata_with_cuts,
    *,
    dataset_input,
    use_weights_in_covmat=True,
    dataset_t0_predictions
):
    """Like :py:func:`covmat_from_systematics` except uses the t0 predictions
    to calculate the absolute contributions to the covmat from multiplicative
    uncertainties. For more info on the t0 predictions see
    :py:func:`dataset_t0_predictions`.

    Parameters
    ----------
    loaded_commondata_with_cuts: validphys.coredata.CommonData
        commondata object for which to generate the covmat.
    dataset_input: validphys.core.DataSetInput
        Dataset settings, contains the weight for the current dataset.
        The returned covmat will be divided by the dataset weight if
        ``use_weights_in_covmat``. The default weight is 1, which means
        the returned covmat will be unmodified.
    use_weights_in_covmat: bool
        Whether to weight the covmat, True by default.
    dataset_t0_predictions: np.array
        1-D array with t0 predictions.

    Returns
    -------
    t0_covmat: np.array
        t0 covariance matrix
    """
    # Delegate to the generic builder, swapping the experimental central
    # values for the t0 predictions.
    return covmat_from_systematics(
        loaded_commondata_with_cuts,
        dataset_input,
        use_weights_in_covmat=use_weights_in_covmat,
        _central_values=dataset_t0_predictions,
    )
dataset_inputs_t0_predictions = collect("dataset_t0_predictions", ("data",))
def dataset_inputs_t0_covmat_from_systematics(
    dataset_inputs_loaded_cd_with_cuts,
    *,
    data_input,
    use_weights_in_covmat=True,
    dataset_inputs_t0_predictions
):
    """Like :py:func:`t0_covmat_from_systematics` except for all data

    Parameters
    ----------
    dataset_inputs_loaded_cd_with_cuts: list[validphys.coredata.CommonData]
        The CommonData for all datasets defined in ``dataset_inputs``.
    data_input: list[validphys.core.DataSetInput]
        Settings for each dataset, each element contains the weight for the
        current dataset. The elements of the returned covmat for dataset
        i and j will be divided by sqrt(weight_i)*sqrt(weight_j), if
        ``use_weights_in_covmat``. The default weight is 1, which means
        the returned covmat will be unmodified.
    use_weights_in_covmat: bool
        Whether to weight the covmat, True by default.
    dataset_inputs_t0_predictions: list[np.array]
        The t0 predictions for all datasets.

    Returns
    -------
    t0_covmat: np.array
        t0 covariance matrix for list of datasets.
    """
    # Same as the experimental grouped covmat, but with the t0 predictions
    # substituted for the central values of every dataset.
    return dataset_inputs_covmat_from_systematics(
        dataset_inputs_loaded_cd_with_cuts,
        data_input,
        use_weights_in_covmat=use_weights_in_covmat,
        _list_of_central_values=dataset_inputs_t0_predictions,
    )
def sqrt_covmat(covariance_matrix):
    r"""Function that computes the square root of the covariance matrix.

    Parameters
    ----------
    covariance_matrix : np.array
        A positive definite covariance matrix, which is N_dat x N_dat (where
        N_dat is the number of data points after cuts) containing uncertainty
        and correlation information.

    Returns
    -------
    sqrt_mat : np.array
        The square root of the input covariance matrix, which is N_dat x N_dat
        (where N_dat is the number of data points after cuts), and which is the
        lower triangular decomposition. The following should be ``True``:
        ``np.allclose(sqrt_covmat @ sqrt_covmat.T, covariance_matrix)``.

    Notes
    -----
    The square root is found by using the Cholesky decomposition. However, rather
    than finding the decomposition of the covariance matrix directly, the (upper
    triangular) decomposition is found of the corresponding correlation matrix
    and then the output of this is rescaled and then transposed as
    ``sqrt_matrix = (decomp * sqrt_diags).T``, where ``decomp`` is the Cholesky
    decomposition of the correlation matrix and ``sqrt_diags`` is the square root
    of the diagonal entries of the covariance matrix. This method is useful in
    situations in which the covariance matrix is near-singular. See
    `here <https://www.gnu.org/software/gsl/doc/html/linalg.html#cholesky-decomposition>`_
    for more discussion on this.

    The lower triangular is useful for efficient calculation of the :math:`\chi^2`

    Example
    -------
    >>> import numpy as np
    >>> from validphys.api import API
    >>> API.sqrt_covmat(dataset_input={"dataset":"NMC"}, theoryid=162, use_cuts="internal")
    array([[0.0326543 , 0.        , 0.        , ..., 0.        , 0.        ,
            0.        ],
           ...,
           [0.00067353, 0.00050372, 0.0003203 , ..., 0.00107255, 0.00065041,
            0.01002952]])
    >>> sqrt_cov = API.sqrt_covmat(dataset_input={"dataset":"NMC"}, theoryid=162, use_cuts="internal")
    >>> cov = API.covariance_matrix(dataset_input={"dataset":"NMC"}, theoryid=162, use_cuts="internal")
    >>> np.allclose(np.linalg.cholesky(cov), sqrt_cov)
    True
    """
    # NOTE: the docstring above is a raw string; the original used a plain
    # string and ``\chi`` is an invalid escape sequence (SyntaxWarning on
    # modern Python).
    dimensions = covariance_matrix.shape
    if covariance_matrix.size == 0:
        raise ValueError("Attempting the decomposition of an empty matrix.")
    elif dimensions[0] != dimensions[1]:
        raise ValueError("The input covariance matrix should be square but "
                         f"instead it has dimensions {dimensions[0]} x "
                         f"{dimensions[1]}")
    sqrt_diags = np.sqrt(np.diag(covariance_matrix))
    # Decompose the correlation matrix rather than the covmat itself: this is
    # numerically better behaved when the covmat is near-singular.
    correlation_matrix = covariance_matrix / sqrt_diags[:, np.newaxis] / sqrt_diags
    decomp = la.cholesky(correlation_matrix)  # upper triangular by default
    # Rescale back to covariance units and transpose to lower triangular.
    sqrt_matrix = (decomp * sqrt_diags).T
    return sqrt_matrix
def groups_covmat_no_table(
        groups_data, groups_index, groups_covmat_collection):
    """Export the covariance matrix for the groups. It exports the full
    (symmetric) matrix, with the 3 first rows and columns being:

    - group name
    - dataset name
    - index of the point within the dataset.
    """
    size = len(groups_index)
    df = pd.DataFrame(
        np.zeros((size, size)), index=groups_index, columns=groups_index
    )
    # Place each group's covmat on the block diagonal, addressed by the
    # group-name level of the MultiIndex.
    for group, group_covmat in zip(groups_data, groups_covmat_collection):
        df.loc[[group.name], [group.name]] = group_covmat
    return df
@table
def groups_covmat(groups_covmat_no_table):
    """Same as :py:func:`groups_covmat_no_table`, but decorated with
    ``@table`` so reportengine renders/exports the result as a table."""
    return groups_covmat_no_table
@table
def groups_sqrtcovmat(
        groups_data, groups_index, groups_sqrt_covmat):
    """Like groups_covmat, but dump the lower triangular part of the
    Cholesky decomposition as used in the fit. The upper part indices are set
    to zero.
    """
    size = len(groups_index)
    df = pd.DataFrame(np.zeros((size, size)), index=groups_index, columns=groups_index)
    for group, group_sqrt_covmat in zip(
            groups_data, groups_sqrt_covmat):
        name = group.name
        # Zero the strict upper triangle on a copy: the original wrote into
        # the collected arrays in place, mutating them for any later user.
        block = np.array(group_sqrt_covmat, copy=True)
        block[np.triu_indices_from(block, k=1)] = 0
        df.loc[[name], [name]] = block
    return df
@table
def groups_invcovmat(
        groups_data, groups_index, groups_covmat_collection):
    """Compute and export the inverse covariance matrix.

    Note that this inverts the matrices with the LU method which is
    suboptimal."""
    size = len(groups_index)
    df = pd.DataFrame(np.zeros((size, size)), index=groups_index, columns=groups_index)
    for group, group_covmat in zip(groups_data, groups_covmat_collection):
        # Improve this inversion if this method turns out to be important
        df.loc[[group.name], [group.name]] = la.inv(group_covmat)
    return df
@table
def groups_normcovmat(groups_covmat, groups_data_values):
    """Calculates the grouped experimental covariance matrix normalised to data."""
    values = np.array(groups_data_values)
    # Divide entry (i, j) by d_i * d_j.
    return groups_covmat / np.outer(values, values)
@table
def groups_corrmat(groups_covmat):
    """Generates the grouped experimental correlation matrix with groups_covmat as input"""
    inv_sqrt_diag = np.diagonal(groups_covmat.values) ** (-0.5)
    # Rescale rows and columns by 1/sigma to obtain the correlation matrix.
    return inv_sqrt_diag[:, np.newaxis] * groups_covmat * inv_sqrt_diag
@check_data_cuts_match_theorycovmat
def dataset_inputs_covmat(
    data: DataGroupSpec,
    fitthcovmat,
    t0set:(PDF, type(None)) = None,
    use_weights_in_covmat: bool = True,
    norm_threshold=None):
    """Like `covmat` except for a group of datasets.

    Loads the group through the libNNPDF-backed ``data.load()`` path,
    optionally sets t0 predictions on the loaded object, optionally adds the
    theory covmat block for the group's datasets, and optionally regularizes
    the result (see :py:func:`covmat` for the meaning of the arguments).
    """
    if not use_weights_in_covmat:
        # Strip dataset weights so the loaded covmat is unweighted.
        data = data.to_unweighted()
    loaded_data = data.load()
    if t0set:
        #Copy data to avoid chaos
        loaded_data = type(loaded_data)(loaded_data)
        log.debug("Setting T0 predictions for %s" % data)
        # Mutates the (copied) loaded object so get_covmat() uses t0 values.
        loaded_data.SetT0(t0set.load_t0().as_libNNPDF())
    covmat = loaded_data.get_covmat()
    if fitthcovmat:
        loaded_thcov = fitthcovmat.load()
        # Level 1 of the theory-covmat index holds dataset names; pick the
        # rows/columns belonging to the datasets of this group.
        ds_names = loaded_thcov.index.get_level_values(1)
        indices = np.in1d(ds_names, [ds.name for ds in data.datasets]).nonzero()[0]
        covmat += loaded_thcov.iloc[indices, indices].values
    if norm_threshold is not None:
        # Clip small eigenvalues of the correlation matrix; see
        # calcutils.regularize_covmat.
        covmat = regularize_covmat(
            covmat,
            norm_threshold=norm_threshold
        )
    return covmat
@check_dataset_cuts_match_theorycovmat
def covmat(
    dataset:DataSetSpec,
    fitthcovmat,
    t0set:(PDF, type(None)) = None,
    use_weights_in_covmat: bool = True,
    norm_threshold=None,
):
    """Returns the covariance matrix for a given `dataset`. By default the
    data central values will be used to calculate the multiplicative contributions
    to the covariance matrix.

    The matrix can instead be constructed with
    the t0 procedure if the user sets `use_t0` to True and gives a
    `t0pdfset`. In this case the central predictions from the `t0pdfset` will be
    used to calculate the multiplicative contributions to the covariance matrix.
    More information on the t0 procedure can be found here:
    https://arxiv.org/abs/0912.2276

    The user can specify `use_fit_thcovmat_if_present` to be True
    and provide a corresponding `fit`. If the theory covmat was used in the
    corresponding `fit` and the specified `dataset` was used in the fit then
    the theory covariance matrix for this `dataset` will be added in quadrature
    to the experimental covariance matrix.

    Covariance matrix can be regularized according to
    `calcutils.regularize_covmat` if the user specifies `norm_threshold`. This
    algorithm sets a minimum threshold for eigenvalues that the corresponding
    correlation matrix can have to be:

    1/(norm_threshold)^2

    which has the effect of limiting the L2 norm of the inverse of the correlation
    matrix. By default norm_threshold is None, which means no regularization
    is performed.

    Parameters
    ----------
    dataset : DataSetSpec
        object parsed from the `dataset_input` runcard key
    fitthcovmat: None or ThCovMatSpec
        None if either `use_thcovmat_if_present` is False or if no theory
        covariance matrix was used in the corresponding fit
    t0set: None or PDF
        None if `use_t0` is False or a PDF parsed from `t0pdfset` runcard key
    use_weights_in_covmat: bool, default True
        Rescale the covmat by the dataset weights.
    norm_threshold: number
        threshold used to regularize covariance matrix

    Returns
    -------
    covmat : array
        a covariance matrix as a numpy array

    Examples
    --------
    >>> from validphys.api import API
    >>> inp = {
            'dataset_input': {'dataset': 'ATLASTTBARTOT'},
            'theoryid': 52,
            'use_cuts': 'no_cuts'
        }
    >>> cov = API.covmat(**inp)

    TODO: complete example
    """
    if not use_weights_in_covmat:
        # Strip the dataset weight so the loaded covmat is unweighted.
        dataset = dataset.to_unweighted()
    loaded_data = dataset.load()
    if t0set:
        #Copy data to avoid chaos
        loaded_data = type(loaded_data)(loaded_data)
        log.debug("Setting T0 predictions for %s" % dataset)
        # Mutates the (copied) loaded object so get_covmat() uses t0 values.
        loaded_data.SetT0(t0set.load_t0().as_libNNPDF())
    covmat = loaded_data.get_covmat()
    if fitthcovmat:
        loaded_thcov = fitthcovmat.load()
        # Extract this dataset's block from the full theory covmat.
        covmat += get_df_block(loaded_thcov, dataset.name, level=1)
    if norm_threshold is not None:
        covmat = regularize_covmat(
            covmat,
            norm_threshold=norm_threshold
        )
    return covmat
@check_pdf_is_montecarlo
def pdferr_plus_covmat(dataset, pdf, covmat):
    """For a given `dataset`, returns the sum of the covariance matrix given by
    `covmat` and the PDF error: a covariance matrix estimated from the
    replica theory predictions from a given monte carlo `pdf`

    Parameters
    ----------
    dataset: DataSetSpec
        object parsed from the `dataset_input` runcard key
    pdf: PDF
        monte carlo pdf used to estimate PDF error
    covmat: np.array
        experimental covariance matrix

    Returns
    -------
    covariance_matrix: np.array
        sum of the experimental and pdf error as a numpy array

    Examples
    --------

    `use_pdferr` makes this action be used for `covariance_matrix`

    >>> import numpy as np
    >>> from validphys.api import API
    >>> inp = {
            'dataset_input': {'dataset' : 'ATLASTTBARTOT'},
            'theoryid': 53,
            'pdf': 'NNPDF31_nlo_as_0118',
            'use_cuts': 'nocuts'
        }
    >>> a = API.covariance_matrix(**inp, use_pdferr=True)
    >>> b = API.pdferr_plus_covmat(**inp)
    >>> np.allclose(a, b)
    True

    See Also
    --------
    covmat: Standard experimental covariance matrix
    """
    th = ThPredictionsResult.from_convolution(pdf, dataset)
    # rowvar=True: each row of _rawdata is treated as a variable (datapoint)
    # and each column as an observation — presumably one column per replica;
    # TODO confirm the _rawdata layout against ThPredictionsResult.
    pdf_cov = np.cov(th._rawdata, rowvar=True)
    return pdf_cov + covmat
def pdferr_plus_dataset_inputs_covmat(data, pdf, dataset_inputs_covmat):
    """Like `pdferr_plus_covmat` except for an experiment"""
    # NOTE(review): calling pdferr_plus_covmat directly bypasses its
    # @check_pdf_is_montecarlo provider check — confirm this is intended.
    return pdferr_plus_covmat(data, pdf, dataset_inputs_covmat)
def dataset_inputs_sqrt_covmat(dataset_inputs_covariance_matrix):
    """Like `sqrt_covmat` but for a group of datasets"""
    # Delegate: the decomposition is identical, only the input provider differs.
    return sqrt_covmat(dataset_inputs_covariance_matrix)
def systematics_matrix_from_commondata(
    loaded_commondata_with_cuts,
    dataset_input,
    use_weights_in_covmat=True,
    _central_values=None
):
    """Returns a systematics matrix, :math:`A`, for the corresponding dataset.
    The systematics matrix is a square root of the covmat:

    .. math::

        C = A A^T

    and is obtained by concatenating a block diagonal of the uncorrelated
    uncertainties with the correlated systematics.
    """
    mat = systematics_matrix(
        loaded_commondata_with_cuts.stat_errors.to_numpy(),
        loaded_commondata_with_cuts.systematic_errors(_central_values),
    )
    if not use_weights_in_covmat:
        return mat
    # The covmat scales as 1/weight, so its square root scales as 1/sqrt(weight).
    return mat / np.sqrt(dataset_input.weight)
def covmat_stability_characteristic(systematics_matrix_from_commondata):
    """
    Return a number characterizing the stability of an experimental covariance
    matrix against uncertainties in the correlation. It is defined as the L2
    norm (largest singular value) of the square root of the inverse correlation
    matrix. This is equivalent to the square root of the inverse of the
    smallest singular value of the correlation matrix:

    Z = (1/λ⁰)^½

    Where λ⁰ is the smallest eigenvalue of the correlation matrix.

    This is the number used as
    threshold in :py:func:`calcutils.regularize_covmat`. The interpretation
    is roughly what precision does the worst correlation need to
    have in order to not affect meaningfully the χ² computed using the
    covariance matrix, so for example a stability characteristic of 4 means
    that correlations need to be known with uncertainties less than 0.25.

    Examples
    --------
    >>> from validphys.api import API
    >>> API.covmat_stability_characteristic(dataset_input={"dataset": "NMC"},
    ...                                     theoryid=162, use_cuts="internal")
    2.742658604186114
    """
    sqrt_cov = systematics_matrix_from_commondata
    # Normalise each row by its L2 norm: this turns the square root of the
    # covmat into a square root of the correlation matrix
    # (cf. calcutils.regularize_l2, which does the same before regularizing).
    row_norms = np.linalg.norm(sqrt_cov, axis=1, keepdims=True)
    sqrt_corr = sqrt_cov / row_norms
    singular_values = np.linalg.svd(sqrt_corr, compute_uv=False)
    # The smallest singular value of sqrt(corr) is sqrt(λ⁰).
    return 1 / singular_values[-1]
dataset_inputs_stability = collect('covmat_stability_characteristic', ('dataset_inputs',))
@table
def dataset_inputs_stability_table(dataset_inputs_stability, dataset_inputs):
    """Return a table with py:func:`covmat_stability_characteristic` for all
    dataset inputs"""
    stability_by_dataset = {
        ds.name: stab
        for ds, stab in zip(dataset_inputs, dataset_inputs_stability)
    }
    return pd.Series(stability_by_dataset, name="stability").sort_values()
def fit_name_with_covmat_label(fit, fitthcovmat):
    """If theory covariance matrix is being used to calculate statistical estimators for the `fit`
    then appends (exp + th) onto the fit name for use in legends and column headers to help the user
    see what covariance matrix was used to produce the plot or table they are looking at.
    """
    suffix = " (exp + th)" if fitthcovmat else ""
    return str(fit) + suffix
@table
@check_norm_threshold
def datasets_covmat_differences_table(
        each_dataset, datasets_covmat_no_reg, datasets_covmat_reg, norm_threshold):
    """For each dataset calculate and tabulate two max differences upon
    regularization given a value for `norm_threshold`:

    - max relative difference to the diagonal of the covariance matrix (%)
    - max absolute difference to the correlation matrix of each covmat
    """
    records = []
    for ds, reg, noreg in zip(
            each_dataset, datasets_covmat_reg, datasets_covmat_no_reg):
        variance_ratio = np.diag(reg) / np.diag(noreg)
        sd_reg = np.sqrt(np.diag(reg))
        sd_noreg = np.sqrt(np.diag(noreg))
        # Correlation matrices: C_ij / (sigma_i * sigma_j).
        corr_reg = reg / np.outer(sd_reg, sd_reg)
        corr_noreg = noreg / np.outer(sd_noreg, sd_noreg)
        records.append(dict(
            dataset=str(ds),
            covdiff=np.max(abs(variance_ratio - 1)) * 100,  # make percentage
            corrdiff=np.max(abs(corr_reg - corr_noreg)),
        ))
    df = pd.DataFrame.from_records(
        records,
        columns=("dataset", "covdiff", "corrdiff"),
        index=("dataset",),
    )
    df.columns = ["Variance rel. diff. (%)", "Correlation max abs. diff."]
    return df
@check_speclabels_different
@table
def dataspecs_datasets_covmat_differences_table(
    dataspecs_speclabel, dataspecs_covmat_diff_tables
):
    """For each dataspec calculate and tabulate the two covmat differences
    described in `datasets_covmat_differences_table`
    (max relative difference in variance and max absolute correlation difference)
    """
    combined = pd.concat(dataspecs_covmat_diff_tables, axis=1)
    # Re-label columns: one outer level per dataspec, inner level the two metrics.
    metric_cols = combined.columns.get_level_values(0).unique()
    combined.columns = pd.MultiIndex.from_product((dataspecs_speclabel, metric_cols))
    return combined
# Per-group covariance matrices, one per metadata group.
groups_covmat_collection = collect(
    'dataset_inputs_covariance_matrix', ('group_dataset_inputs_by_metadata',)
)
# Per-group Cholesky square roots, matching groups_covmat_collection.
groups_sqrt_covmat = collect(
    'dataset_inputs_sqrt_covmat',
    ('group_dataset_inputs_by_metadata',)
)
# One covmat-differences table per dataspec.
dataspecs_covmat_diff_tables = collect(
    "datasets_covmat_differences_table", ("dataspecs",)
)
# Labelled fit names (with "(exp + th)" suffix where applicable) for all fits.
fits_name_with_covmat_label = collect('fit_name_with_covmat_label', ('fits',))
# Covariance matrices with regularization disabled vs. the default namespace.
datasets_covmat_no_reg = collect(
    "covariance_matrix", ("data", "no_covmat_reg"))
datasets_covmat_reg = collect(
    "covariance_matrix", ("data",))
datasets_covmat = collect('covariance_matrix', ('data',))
# Legacy experiments-based collection of per-dataset covariance matrices.
datasets_covariance_matrix = collect(
    'covariance_matrix',
    ('experiments', 'experiment',)
)