Skip to content

Commit e13625a

Browse files
committed
statistical routines tutorial: advanced section on building a confidence interval with toys, with solution
1 parent af57d49 commit e13625a

File tree

2 files changed

+231
-0
lines changed

2 files changed

+231
-0
lines changed
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
"""Plot the 1D likelihood scan together with the 68% quantile of the test
2+
statistic computed from toys at each scanned r value.
3+
4+
It should be run after producing the toy files for each r value and the likelihood scan file using the commands in the tutorial.
5+
For the former, the command is something like:
6+
7+
for r_val in $(seq -2 0.08 6); do
8+
combine -M MultiDimFit datacard.txt --rMin -10 --rMax 10 --algo fixed \
9+
--fixedPointPOIs r=${r_val} --setParameters r=${r_val} \
10+
-t 1000 --toysFrequentist -n ".r_${r_val}"
11+
done
12+
13+
The script:
14+
1. Loops over higgsCombine.r_<value>.MultiDimFit.mH120.123456.root files
15+
and computes the 0.68 quantile of 2*deltaNLL for each r value.
16+
2. Reads the likelihood scan from higgsCombineTest.MultiDimFit.mH120.root.
17+
3. Plots both curves and finds their crossings via interpolation.
18+
"""
19+
20+
import glob
21+
import os
22+
import re
23+
24+
import matplotlib.pyplot as plt
25+
import numpy as np
26+
import scipy.stats as st
27+
from scipy.interpolate import interp1d
28+
29+
from ROOT import TFile
30+
31+
32+
def get_quantile_for_file(filepath, n_sigma=1):
    """Compute the toy-based test-statistic cutoff at the given sigma level.

    Opens the MultiDimFit toy output at *filepath*, collects ``2*deltaNLL``
    for every toy entry (skipping entries with ``quantileExpected < 0``,
    i.e. non-toy bookkeeping entries), and returns the
    ``1 - 2*Phi(-n_sigma)`` quantile of that distribution (the ~0.683
    quantile for ``n_sigma=1``). Follows the same logic as get_quantile.py.

    Parameters
    ----------
    filepath : str
        Path to a ``higgsCombine*.MultiDimFit*.root`` toy file.
    n_sigma : float, optional
        Sigma level defining the quantile (default 1, i.e. ~68% CL).

    Returns
    -------
    float or None
        Quantile of ``2*deltaNLL`` over the toys, or ``None`` when the file
        has no ``limit`` tree or no usable entries.
    """
    # Two-sided tail probability corresponding to n_sigma (~0.3173 for 1 sigma).
    tail_prob = 2 * st.norm().cdf(-n_sigma)

    f = TFile(filepath, "READ")
    try:
        limit = f.Get("limit")
        if not limit:
            # No 'limit' tree: nothing to compute for this file.
            return None

        m2nll_vals = []
        for i in range(limit.GetEntries()):
            limit.GetEntry(i)
            # quantileExpected < 0 flags entries that are not toy results.
            if limit.quantileExpected < 0:
                continue
            m2nll_vals.append(2 * limit.deltaNLL)
    finally:
        # Always release the ROOT file handle; the original version leaked
        # it on the early return when the 'limit' tree was missing.
        f.Close()

    if not m2nll_vals:
        return None

    return np.quantile(m2nll_vals, 1 - tail_prob)
59+
def read_scan(filepath, poi="r"):
    """Read a 1D likelihood scan from a MultiDimFit output file.

    Parameters
    ----------
    filepath : str
        Path to the MultiDimFit scan output (e.g.
        ``higgsCombineTest.MultiDimFit.mH120.root``).
    poi : str, optional
        Name of the parameter of interest branch to read (default ``"r"``).

    Returns
    -------
    tuple of numpy.ndarray
        ``(poi_values, 2*deltaNLL)``, both sorted by increasing POI value.

    Raises
    ------
    RuntimeError
        If the file contains no ``limit`` tree (previously this crashed
        with an opaque attribute error).
    """
    f = TFile(filepath, "READ")
    try:
        limit = f.Get("limit")
        if not limit:
            # Explicit failure, consistent with get_quantile_for_file's check.
            raise RuntimeError("no 'limit' tree found in %s" % filepath)

        r_vals = []
        dnll_vals = []
        for i in range(limit.GetEntries()):
            limit.GetEntry(i)
            # Entries with quantileExpected < -1.5 are bookkeeping snapshots,
            # not scan points.
            if limit.quantileExpected < -1.5:
                continue
            r_vals.append(getattr(limit, poi))
            dnll_vals.append(2 * limit.deltaNLL)
    finally:
        # Close the file even if reading raises part-way through.
        f.Close()

    r_vals = np.array(r_vals)
    dnll_vals = np.array(dnll_vals)

    # Sort by POI value so downstream interpolation sees monotonic x.
    order = np.argsort(r_vals)
    return r_vals[order], dnll_vals[order]
85+
def main():
    """Plot the likelihood scan against the toy-based 68% quantile curve.

    Workflow:
      1. Glob the per-r toy files and compute the 0.68 quantile of the test
         statistic at each scanned r value.
      2. Read the observed likelihood scan.
      3. Interpolate both curves on a common fine grid and locate their
         crossings, which define the toy-based confidence interval.
      4. Save the comparison plot as PNG and PDF next to this script.

    Reads/writes files only in the directory containing this script.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    # --- Step 1: compute 0.68 quantile for each r value from toy files ---
    pattern = os.path.join(base_dir, "higgsCombine.r_*.MultiDimFit.mH120.123456.root")
    toy_files = sorted(glob.glob(pattern))

    r_points = []
    quantile_points = []
    for fpath in toy_files:
        basename = os.path.basename(fpath)
        # Recover the scanned r value from the file name produced by the
        # `combine ... -n ".r_${r_val}"` command in the tutorial.
        match = re.search(r"higgsCombine\.r_([-\d.eE+]+)\.MultiDimFit", basename)
        if not match:
            continue
        r_val = float(match.group(1))
        q = get_quantile_for_file(fpath, n_sigma=1)
        # Skip files with no usable toys (get_quantile_for_file returns None).
        if q is not None:
            r_points.append(r_val)
            quantile_points.append(q)

    r_points = np.array(r_points)
    quantile_points = np.array(quantile_points)

    # Sort by r so interp1d below receives monotonically increasing x.
    order = np.argsort(r_points)
    r_points = r_points[order]
    quantile_points = quantile_points[order]

    print(f"Computed quantile for {len(r_points)} r values")

    # --- Step 2: read the likelihood scan ---
    scan_file = os.path.join(base_dir, "higgsCombineTest.MultiDimFit.mH120.root")
    scan_r, scan_dnll = read_scan(scan_file)
    print(f"Read scan with {len(scan_r)} points")

    # --- Step 3: interpolate the quantile curve ---
    # NOTE(review): cubic interpolation needs at least 4 points — assumes
    # enough toy files were produced; confirm before running on a subset.
    quantile_interp = interp1d(r_points, quantile_points, kind="cubic")

    # --- Step 4: find crossings ---
    # Restrict to the overlap region so neither interpolator extrapolates.
    r_min = max(r_points.min(), scan_r.min())
    r_max = min(r_points.max(), scan_r.max())

    # Interpolate both curves on a fine common grid
    r_fine = np.linspace(r_min, r_max, 5000)
    scan_interp = interp1d(scan_r, scan_dnll, kind="cubic")
    scan_fine = scan_interp(r_fine)
    quantile_fine = quantile_interp(r_fine)

    # A sign change of (scan - quantile) between adjacent grid points marks
    # a crossing of the two curves.
    diff = scan_fine - quantile_fine
    crossings = []
    for i in range(len(diff) - 1):
        if diff[i] * diff[i + 1] < 0:
            # Linear interpolation for the crossing
            r_cross = r_fine[i] - diff[i] * (r_fine[i + 1] - r_fine[i]) / (diff[i + 1] - diff[i])
            crossings.append(r_cross)

    # Find the best-fit point (minimum of the scan)
    bestfit = scan_r[np.argmin(scan_dnll)]

    # Sort crossings into lo/hi relative to best fit: the interval edges are
    # the crossing nearest below and nearest above the best-fit value.
    crossings_lo = sorted([c for c in crossings if c < bestfit])
    crossings_hi = sorted([c for c in crossings if c >= bestfit])
    lo = crossings_lo[-1] if crossings_lo else None
    hi = crossings_hi[0] if crossings_hi else None

    err_lo = bestfit - lo if lo is not None else None
    err_hi = hi - bestfit if hi is not None else None

    # Build a human-readable "r = x +hi -lo" summary, omitting missing sides.
    label_parts = [f"r = {bestfit:.3f}"]
    if err_hi is not None:
        label_parts.append(f"+{err_hi:.3f}")
    if err_lo is not None:
        label_parts.append(f"-{err_lo:.3f}")
    bestfit_label = " ".join(label_parts)

    print(f"Best fit: {bestfit_label}")
    print(f"Found {len(crossings)} crossing(s):")
    for c in crossings:
        print(f"  r = {c:.4f}")

    # --- Step 5: plot ---
    fig, ax = plt.subplots(figsize=(8, 6))

    ax.plot(
        scan_r,
        scan_dnll,
        "k-",
        linewidth=2,
        label="Likelihood scan ($2\\Delta\\mathrm{NLL}$)",
    )
    ax.plot(
        r_fine,
        quantile_fine,
        "-",
        color="0.4",
        linewidth=1.5,
        label="68% quantile (toys)",
    )
    # Reference line for the asymptotic (Wilks) 68% CL cutoff, 2*dNLL = 1.
    ax.axhline(
        1.0,
        color="lightblue",
        linewidth=1.5,
        linestyle="-",
        label="$2\\Delta\\mathrm{NLL} = 1$",
    )

    # Mark each crossing with a vertical guide line and a red dot on the scan.
    for c in crossings:
        y_cross = float(scan_interp(c))
        ax.axvline(c, color="gray", linestyle=":", alpha=0.7)
        ax.plot(c, y_cross, "o", color="red", markersize=8, zorder=5)

    # Annotate best-fit value with uncertainties (stacked like plot1DScan)
    hi_str = f"+{err_hi:.3f}" if err_hi is not None else ""
    lo_str = f"-{err_lo:.3f}" if err_lo is not None else ""
    bestfit_text = f"$r = {bestfit:.3f}^{{{hi_str}}}_{{{lo_str}}}$"
    ax.text(
        0.80,
        0.95,
        bestfit_text,
        transform=ax.transAxes,
        fontsize=14,
        verticalalignment="top",
        horizontalalignment="right",
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8),
    )

    ax.set_xlabel("r", fontsize=13)
    ax.set_ylabel("$2\\Delta\\mathrm{NLL}$", fontsize=13)
    ax.set_title("Likelihood scan vs toy-based 68% quantile")
    ax.legend(fontsize=11, loc="upper left")
    ax.set_ylim(bottom=0)

    outpath = os.path.join(base_dir, "scan_vs_quantile.png")
    fig.savefig(outpath, dpi=150, bbox_inches="tight")
    print(f"Saved plot to {outpath}")

    # Also save as pdf
    outpath_pdf = os.path.join(base_dir, "scan_vs_quantile.pdf")
    fig.savefig(outpath_pdf, bbox_inches="tight")
    print(f"Saved plot to {outpath_pdf}")


if __name__ == "__main__":
    main()

docs/tutorial_stat_routines/stat_routines.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,9 @@ combine -M MultiDimFit datacard.txt --rMin -10 --rMax 10 --algo fixed --fixedPoi
204204
Test out a few values of `r` and see if they all give you the same result.
205205
What happens for `r` less than about -1? Can you explain why? (hint: look at the values in the datacard)
206206

207+
**Advanced**: if you run the command above for a set of points in the range [-2, 6] (the same one used in the scan before), you can build the line of critical values for $t_{\mu}$ and check where the observed likelihood scan crosses it in order to find the confidence interval. Try to do that and compare it to the interval we derived earlier using Wilks' theorem.
208+
If you need help, take a look at the code in `plot_scan_with_quantile_solution.py`.
209+
207210

208211
## Significance Testing
209212

0 commit comments

Comments
 (0)