-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathmultihist.py
More file actions
1006 lines (838 loc) · 39 KB
/
multihist.py
File metadata and controls
1006 lines (838 loc) · 39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
from __future__ import division
from copy import deepcopy
from functools import reduce
try:
from itertools import izip as zip
except ImportError:
# Hello, python 3!
pass
import numpy as np
try:
from scipy.ndimage import zoom
from scipy import stats
HAVE_SCIPY = True
except ImportError:
HAVE_SCIPY = False
try:
import matplotlib
import matplotlib.pyplot as plt
CAN_PLOT = True
except ImportError:
plt = None
CAN_PLOT = False
COLUMNAR_DATA_SOURCES = []
try:
import dask
import dask.dataframe
import dask.multiprocessing
WE_HAVE_DASK = True
DEFAULT_DASK_COMPUTE_KWARGS = dict(get=dask.multiprocessing.get)
COLUMNAR_DATA_SOURCES.append(dask.dataframe.DataFrame)
except Exception: # Sometimes dask import succeeds, but throws error when starting up
WE_HAVE_DASK = False
pass
try:
import pandas as pd
COLUMNAR_DATA_SOURCES.append(pd.DataFrame)
except ImportError:
pass
COLUMNAR_DATA_SOURCES = tuple(COLUMNAR_DATA_SOURCES)
from operator import itemgetter
__version__ = '0.6.6'
class CoordinateOutOfRangeException(Exception):
pass
class MultiHistBase(object):
def similar_blank_hist(self):
newhist = deepcopy(self)
newhist.histogram = np.zeros_like(self.histogram)
return newhist
@property
def n(self):
"""Returns number of data points loaded into histogram"""
return np.sum(self.histogram)
# Overload binary numeric operators to work on histogram
# TODO: logical operators
def __getitem__(self, item):
return self.histogram[item]
def __setitem__(self, key, value):
self.histogram[key] = value
# Let unary operators work on wrapped histogram:
def min(self):
return self.histogram.min()
def max(self):
return self.histogram.max()
def __len__(self):
return len(self.histogram)
def __neg__(self):
return self.__class__.from_histogram(-self.histogram, self.bin_edges, self.axis_names)
def __pos__(self):
return self.__class__.from_histogram(+self.histogram, self.bin_edges, self.axis_names)
def __abs__(self):
return self.__class__.from_histogram(abs(self.histogram), self.bin_edges, self.axis_names)
def __invert__(self):
return self.__class__.from_histogram(~self.histogram, self.bin_edges, self.axis_names)
# Let binary operators work on wrapped histogram
@classmethod
def _make_binop(cls, opname):
def binop(self, other):
return self.__class__.from_histogram(
getattr(self.histogram, opname)(other),
self.bin_edges,
self.axis_names)
return binop
for methodname in 'add sub mul div truediv floordiv mod divmod pow lshift rshift and or'.split():
dundername = '__%s__' % methodname
setattr(MultiHistBase,
dundername,
MultiHistBase._make_binop(dundername))
setattr(MultiHistBase,
'__r%s__' % methodname,
getattr(MultiHistBase, dundername))
# Verbose alias
MultiHistBase.similar_blank_histogram = MultiHistBase.similar_blank_hist
class Hist1d(MultiHistBase):
axis_names = None
dimensions = 1
@classmethod
def from_histogram(cls, histogram, bin_edges, axis_names=None):
"""Make a Hist1D from a numpy bin_edges + histogram pair
:param histogram: Initial histogram
:param bin_edges: Bin edges of histogram. Must be one longer than length of histogram
:param axis_names: Ignored. Sorry :-)
:return:
"""
if len(bin_edges) != len(histogram) + 1:
raise ValueError("Bin edges must be of length %d, you gave %d!" % (len(histogram) + 1, len(bin_edges)))
self = cls(bins=bin_edges)
self.histogram = np.array(histogram)
return self
def __init__(self, data=None, bins=10, range=None, weights=None):
"""
:param data: Initial data to histogram.
:param bins: Number of bins, or list of bin edges (like np.histogram)
:param weights: Weights for initial data.
:param range: Range of histogram.
:return: None
"""
if data is None:
data = []
self.histogram, self.bin_edges = np.histogram(data, bins=bins, range=range, weights=weights)
def add(self, data, weights=None):
hist, _ = np.histogram(data, self.bin_edges, weights=weights)
self.histogram += hist
@property
def bin_centers(self):
return 0.5 * (self.bin_edges[1:] + self.bin_edges[:-1])
def bin_volumes(self):
return np.diff(self.bin_edges)
@property
def density(self):
"""Gives emprical PDF, like np.histogram(...., density=True)"""
h = self.histogram.astype(float)
bindifs = np.array(np.diff(self.bin_edges), float)
return h / (bindifs * self.n)
@property
def normalized_histogram(self):
"""Gives histogram with sum of entries normalized to 1."""
return self.histogram / self.n
@property
def cumulative_histogram(self):
return np.cumsum(self.histogram)
@property
def cumulative_density(self):
cs = np.cumsum(self.histogram)
return cs / cs[-1]
def get_random(self, size=10):
"""Returns random variates from the histogram.
Note this assumes the histogram is an 'events per bin', not a pdf.
Inside the bins, a uniform distribution is assumed.
"""
bin_i = np.random.choice(np.arange(len(self.bin_centers)), size=size, p=self.normalized_histogram)
return self.bin_centers[bin_i] + np.random.uniform(-0.5, 0.5, size=size) * self.bin_volumes()[bin_i]
def items(self):
"""Iterate over (bin_center, hist_value) from left to right"""
return zip(self.bin_centers, self.histogram)
@property
def mean(self):
"""Estimates mean of underlying data, assuming each datapoint was exactly in the center of its bin."""
return np.average(self.bin_centers, weights=self.histogram)
@property
def std(self, bessel_correction=True):
"""Estimates std of underlying data, assuming each datapoint was exactly in the center of its bin."""
if bessel_correction:
n = self.n
bc = n / (n - 1)
else:
bc = 1
return np.sqrt(np.average((self.bin_centers - self.mean) ** 2, weights=self.histogram)) * bc
def get_axis_bin_index(self, value):
"""Returns index along axis of bin in histogram which contains value
Inclusive on both endpoints
"""
return _find_axis_bin_index(value, self.bin_edges, 0)
##
# Data reduction: sum, slice, project, ...
##
def sum(self):
"""Returns a single-bin Hist1d containing the sum of the histogram.
(For compatibility with Histdd.sum. To get the numerical value, use Hist1d.n.)
"""
return Hist1d.from_histogram(
np.array([self.histogram.sum()]),
bin_edges=np.array([self.bin_edges[0], self.bin_edges[-1]]),
)
def slice(self, start, stop=None, axis=None):
"""Return another Hist1d restricted to bins whose values (not bin numbers)
are between start and stop along axis (both inclusive).
(The axis argument is not used but present for compatibility with Histdd.slice.)
"""
if stop is None:
# Make a 1=bin slice
stop = start
start_bin = max(0, self.get_axis_bin_index(start))
stop_bin = min(len(self.bin_centers) - 1, # TODO: test off by one!
self.get_axis_bin_index(stop))
new_bin_edges = self.bin_edges.copy()[start_bin:stop_bin + 2] # TODO: Test off by one here!
return Hist1d.from_histogram(
np.take(self.histogram, np.arange(start_bin, stop_bin + 1)),
bin_edges=new_bin_edges,
)
def slicesum(self, start, stop=None, axis=None):
"""Slices the histogram, then returns a single-bin Hist1d.
(For compatibility with Histdd slicesum. The axis argument is not used.
To get a number instead of a single-bin Hist1d, use .slice(...).n)
"""
return self.slice(start, stop).sum()
def data_for_plot(
self,
normed=False, scale_histogram_by=1.0, scale_errors_by=1.0,
errors=False, error_style='bar'):
"""Return (x, y, ylow, yhigh) of data used for plotting.
Specifically, returns:
x: x-values of lines/points; bin centers or edges, depending on error_style
y: y-values of lines/points, same length as x
ylow: lower bound of error bars/bands, same length as x
yhigh: upper bound of error bars/bands, same length as x
Arguments are as described in plot
"""
y = self.histogram
if normed:
scale_histogram_by /= y.sum()
if errors == 'sqrtn':
_yerr = y**0.5
ylow, yhigh = y - _yerr, y + _yerr
elif errors == 'central':
ylow, yhigh = poisson_1s_interval(y, fc=False)
elif errors == 'model_quantiles':
ylow, yhigh = stats.poisson(y).ppf(stats.norm.cdf(-1)), stats.poisson(y).ppf(stats.norm.cdf(1))
# Show some error band <0.2 expected events
yhigh = yhigh.clip(y, None)
elif errors:
ylow, yhigh = poisson_1s_interval(y, fc=True)
else:
ylow, yhigh = y, y
y = y.astype(float) * scale_histogram_by
ylow = ylow.astype(float) * scale_histogram_by
yhigh = yhigh.astype(float) * scale_histogram_by
ylow = (y - (y - ylow) * scale_errors_by).clip(0, None)
yhigh = y + (yhigh - y) * scale_errors_by
if errors and error_style == 'bar':
# Plotting (dots with) bars
x = self.bin_centers
else:
# Plot some kind of stepped line: we will plot vs the bin *edges*
# using steps-pre (not steps-mid)
# If we would have plotted values only vs the centers
# * the steps won't be correct for log scales
# * the final bins will not fully show
x = self.bin_edges
y, ylow, yhigh = [_repeat_first(q) for q in [y, ylow, yhigh]]
return x, y, ylow, yhigh
def plot(self,
normed=False, scale_histogram_by=1.0, scale_errors_by=1.0,
errors=False, error_style='bar', error_alpha=0.3,
plt=plt, set_xlim=False,
**kwargs):
"""Plot the histogram, with error bars if desired.
:param normed: Scale the histogram so the sum is 1 before plotting.
Errors are computed before scaling, then scaled accordingly.
:param scale_histogram_by: Custom multiplier to apply to histogram.
Errors are computed before scaling, then scaled accordingly.
:param scale_errors_by: Custom multiplier to apply to errors.
This scales the differences between the low/high errors and the
mean. For low counts / asymmetric errors, this will look strange.
:param errors: Whether and how to plot 1 sigma error bars
* False shows no errors
* True or 'fc' for Feldman-Cousin errors.
For > 20 events, central Poisson intervals are used
* 'central' for central Poisson confidence intervals
* 'sqrtn' for sqrt(n) errors
* 'model_quantiles' for central Poisson inverval assuming
bin content is the Poisson mean (expectation), NOT a
confidence interval
:param errorstyle: How to plot errors (if errors is not False)
* 'bar' for error bars
* 'band' for shaded bands
:param error_alpha: Alpha multiplier for errorstyle='band'
:param plt: Object to call plt... on; matplotlib.pyplot by default.
:param set_xlim: If True, set xlim to the range of the hist
"""
if not CAN_PLOT:
raise ValueError(
"matplotlib did not import, so can't plot your histogram...")
x, y, ylow, yhigh = self.data_for_plot(
normed=normed,
scale_histogram_by=scale_histogram_by,
scale_errors_by=scale_errors_by,
errors=errors,
error_style=error_style)
if errors and error_style == 'bar':
# Plot as points with error bars
kwargs.setdefault('linestyle', 'none')
kwargs.setdefault('marker', '.')
plt.errorbar(x,
y,
yerr=[y - ylow, yhigh - y],
**kwargs)
else:
if errors and error_style == 'band':
line = plt.plot(x, y, drawstyle='steps-pre', **kwargs)
kwargs['color'] = line[0].get_color()
kwargs['alpha'] = error_alpha * kwargs.get('alpha', 1)
kwargs['linewidth'] = 0
if 'label' in kwargs:
# Don't want to double-label!
del kwargs['label']
plt.fill_between(x, ylow, yhigh,
step='pre', **kwargs)
else:
plt.plot(x, y, drawstyle='steps-pre', **kwargs)
if set_xlim:
plt.xlim(x[0], x[-1])
def percentile(self, percentile):
"""Return bin center nearest to percentile"""
return self.bin_centers[np.argmin(np.abs(self.cumulative_density * 100 - percentile))]
def lookup(self, coordinates):
"""Lookup values at coordinates.
coordinates: arraylike of coordinates.
Clips if out of range!! TODO: option to throw exception instead.
TODO: Needs tests!!
"""
# Convert coordinates to indices
index_array = np.clip(np.searchsorted(self.bin_edges, coordinates) - 1,
0,
len(self.bin_edges) - 2)
# Use the index array to slice the histogram
return self.histogram[index_array]
class Histdd(MultiHistBase):
"""multidimensional histogram object
"""
axis_names = None
@classmethod
def from_histogram(cls, histogram, bin_edges, axis_names=None):
"""Make a HistdD from numpy histogram + bin edges
:param histogram: Initial histogram
:param bin_edges: x bin edges of histogram, y bin edges, ...
:param axis_names: Names of axes
:return: Histnd instance
"""
self = cls(bins=bin_edges, axis_names=axis_names)
self.histogram = histogram
return self
def __init__(self, *data, **kwargs):
for k, v in {'bins': 10, 'range': None, 'weights': None, 'axis_names': None}.items():
kwargs.setdefault(k, v)
# dimensions is a shorthand [(axis_name_1, bins_1), (axis_name_2, bins_2), ...]
if 'dimensions' in kwargs:
kwargs['axis_names'], kwargs['bins'] = zip(*kwargs['dimensions'])
if len(data) == 0:
if kwargs['range'] is None:
if kwargs['bins'] is None:
raise ValueError("Must specify data, bins, or range")
try:
dimensions = len(kwargs['bins'])
except TypeError:
raise ValueError("If you specify no data and no ranges, must specify a bin specification "
"which tells me what dimension you want. E.g. [10, 10, 10] instead of 10.")
else:
dimensions = len(kwargs['range'])
data = np.zeros((0, dimensions)).T
self.axis_names = kwargs['axis_names']
self.histogram, self.bin_edges = self._data_to_hist(data, **kwargs)
def add(self, *data, **kwargs):
self.histogram += self._data_to_hist(data, **kwargs)[0]
@staticmethod
def _is_columnar(x):
if isinstance(x, COLUMNAR_DATA_SOURCES):
return True
if isinstance(x, np.ndarray) and x.dtype.fields:
return True
return False
def _data_to_hist(self, data, **kwargs):
"""Return bin_edges, histogram array"""
if hasattr(self, 'bin_edges'):
kwargs.setdefault('bins', self.bin_edges)
if len(data) == 1 and self._is_columnar(data[0]):
data = data[0]
if self.axis_names is None:
raise ValueError("When histogramming from a columnar data source, "
"axis_names or dimensions is mandatory")
is_dask = False
if WE_HAVE_DASK:
is_dask = isinstance(data, dask.dataframe.DataFrame)
if is_dask:
fake_histogram = Histdd(axis_names=self.axis_names, bins=kwargs['bins'])
partial_hists = []
for partition in data.to_delayed():
ph = dask.delayed(Histdd)(partition, axis_names=self.axis_names, bins=kwargs['bins'])
ph = dask.delayed(lambda x: x.histogram)(ph)
ph = dask.array.from_delayed(ph,
shape=fake_histogram.histogram.shape,
dtype=fake_histogram.histogram.dtype)
partial_hists.append(ph)
partial_hists = dask.array.stack(partial_hists, axis=0)
compute_options = kwargs.get('compute_options', {})
for k, v in DEFAULT_DASK_COMPUTE_KWARGS.items():
compute_options.setdefault(k, v)
histogram = partial_hists.sum(axis=0).compute(**compute_options)
bin_edges = fake_histogram.bin_edges
return histogram, bin_edges
else:
data = np.vstack([data[x].values if isinstance(data, pd.DataFrame) else data[x]
for x in self.axis_names])
data = np.array(data).T
return np.histogramdd(data,
bins=kwargs.get('bins'),
weights=kwargs.get('weights'),
range=kwargs.get('range'))
@property
def dimensions(self):
return len(self.bin_edges)
##
# Axis selection
##
def get_axis_number(self, axis):
if isinstance(axis, int):
return axis
if isinstance(axis, str):
if self.axis_names is None:
raise ValueError("Axis name %s not in histogram: histogram has no named axes." % axis)
if axis in self.axis_names:
return self.axis_names.index(axis)
raise ValueError("Axis name %s not in histogram. Axis names which are: %s" % (axis, self.axis_names))
raise ValueError("Argument to get_axis_number should be string or integer, but you gave %s" % axis)
def other_axes(self, axis):
axis = self.get_axis_number(axis)
return tuple([i for i in range(self.dimensions) if i != axis])
def axis_names_without(self, axis):
"""Return axis names without axis, or None if axis_names is None"""
if self.axis_names is None:
return None
return itemgetter(*self.other_axes(axis))(self.axis_names)
##
# Bin wrangling: centers <-> edges, values <-> indices
##
def bin_centers(self, axis=None):
"""Return bin centers along an axis, or if axis=None, list of bin_centers along each axis"""
if axis is None:
return [self.bin_centers(axis=i) for i in range(self.dimensions)]
axis = self.get_axis_number(axis)
return 0.5 * (self.bin_edges[axis][1:] + self.bin_edges[axis][:-1])
def get_axis_bin_index(self, value, axis):
"""Returns index along axis of bin in histogram which contains value
Inclusive on both endpoints
"""
axis = self.get_axis_number(axis)
bin_edges = self.bin_edges[axis]
return _find_axis_bin_index(value, bin_edges, axis)
def get_bin_indices(self, values):
"""Returns index tuple in histogram of bin which contains value"""
return tuple([self.get_axis_bin_index(values[ax_i], ax_i)
for ax_i in range(self.dimensions)])
def all_axis_bin_centers(self, axis):
"""Return ndarray of same shape as histogram containing bin center value along axis at each point"""
# Arcane hack that seems to work, at least in 3d... hope
axis = self.get_axis_number(axis)
return np.meshgrid(*self.bin_centers(), indexing='ij')[axis]
##
# Data reduction: sum, slice, project, ...
##
def sum(self, axis):
"""Sums all data along axis, returns d-1 dimensional histogram"""
axis = self.get_axis_number(axis)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(np.sum(self.histogram, axis=axis),
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
def slice(self, start, stop=None, axis=0):
"""Restrict histogram to bins whose data values (not bin numbers) along axis are between start and stop
(both inclusive). Returns d dimensional histogram."""
if stop is None:
# Make a 1=bin slice
stop = start
axis = self.get_axis_number(axis)
start_bin = max(0, self.get_axis_bin_index(start, axis))
stop_bin = min(len(self.bin_centers(axis)) - 1, # TODO: test off by one!
self.get_axis_bin_index(stop, axis))
new_bin_edges = self.bin_edges.copy()
new_bin_edges[axis] = new_bin_edges[axis][start_bin:stop_bin + 2] # TODO: Test off by one here!
return Histdd.from_histogram(np.take(self.histogram, np.arange(start_bin, stop_bin + 1), axis=axis),
bin_edges=new_bin_edges, axis_names=self.axis_names)
def slicesum(self, start, stop=None, axis=0):
"""Slices the histogram along axis, then sums over that slice, returning a d-1 dimensional histogram"""
return self.slice(start, stop, axis).sum(axis)
def projection(self, axis):
"""Sums all data along all other axes, then return Hist1D"""
axis = self.get_axis_number(axis)
projected_hist = np.sum(self.histogram, axis=self.other_axes(axis))
return Hist1d.from_histogram(projected_hist, bin_edges=self.bin_edges[axis])
##
# Density methods: cumulate, normalize, ...
##
def cumulate(self, axis):
"""Returns new histogram with all data cumulated along axis."""
axis = self.get_axis_number(axis)
return Histdd.from_histogram(np.cumsum(self.histogram, axis=axis),
bin_edges=self.bin_edges,
axis_names=self.axis_names)
def _simsalabim_slice(self, axis):
return [slice(None) if i != axis else np.newaxis
for i in range(self.dimensions)]
def normalize(self, axis):
"""Returns new histogram where all values along axis (in one bin of the other axes) sum to 1"""
axis = self.get_axis_number(axis)
sum_along_axis = np.sum(self.histogram, axis=axis)
# Don't do anything for subspaces without any entries -- this avoids nans everywhere
sum_along_axis[sum_along_axis == 0] = 1
hist = self.histogram / sum_along_axis[tuple(self._simsalabim_slice(axis))]
return Histdd.from_histogram(hist,
bin_edges=self.bin_edges,
axis_names=self.axis_names)
def cumulative_density(self, axis):
"""Returns new histogram with all values replaced by their cumulative densities along axis."""
return self.normalize(axis).cumulate(axis)
def central_likelihood(self, axis):
"""Returns new histogram with all values replaced by their central likelihoods along axis."""
result = self.cumulative_density(axis)
result.histogram = 1 - 2 * np.abs(result.histogram - 0.5)
return result
##
# Mixed methods: both reduce and summarize the data
##
def percentile(self, percentile, axis, inclusive=True):
"""Returns d-1 dimensional histogram containing percentile of values along axis
if inclusive=True, will report bin center of first bin for which percentile% of data lies in or below the bin
=False, ... data lies strictly below the bin
10% percentile is calculated as: value at least 10% data is LOWER than
"""
axis = self.get_axis_number(axis)
# Shape of histogram
s = self.histogram.shape
# Shape of histogram after axis has been collapsed to 1
s_collapsed = list(s)
s_collapsed[axis] = 1
# Shape of histogram with axis removed entirely
s_removed = np.concatenate([s[:axis], s[axis + 1:]]).astype(int)
# Using np.where here is too tricky, as it may not return a value for each "bin-columns"
# First, get an array which has a minimum at the percentile-containing bins
# The minimum may not be unique: if later bins are empty, they will not be
ecdf = self.cumulative_density(axis).histogram
if not inclusive:
raise NotImplementedError("Non-inclusive percentiles not yet implemented")
ecdf = np.nan_to_num(ecdf) # Since we're relying on self-equality later
x = ecdf - 2 * (ecdf >= percentile / 100)
# We now want to get the location of the minimum
# To ensure it is unique, add a very very very small monotonously increasing bit to x
# Nobody will want 1e-9th percentiles, right? TODO
sz = np.ones(len(s), dtype=int)
sz[axis] = -1
x += np.linspace(0, 1e-9, s[axis]).reshape(sz)
# 1. Find the minimum along the axis
# 2. Reshape to s_collapsed and perform == to get a mask
# 3. Apply the mask to the bin centers along axis
# 4. Unflatten with reshape
result = self.all_axis_bin_centers(axis)[
x == np.min(x, axis=axis).reshape(s_collapsed)
]
result = result.reshape(s_removed)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(histogram=result,
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
def average(self, axis):
"""Returns d-1 dimensional histogram of (estimated) mean value of axis
NB this is very different from averaging over the axis!!!
"""
axis = self.get_axis_number(axis)
avg_hist = np.ma.average(self.all_axis_bin_centers(axis),
weights=self.histogram, axis=axis)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(histogram=avg_hist,
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
def std(self, axis):
"""Returns d-1 dimensional histogram of (estimated) std value along axis
NB this is very different from just std of the histogram values (which describe bin counts)
"""
def weighted_std(values, weights, axis):
# Stolen from http://stackoverflow.com/questions/2413522
average = np.average(values, weights=weights, axis=axis)
average = average[self._simsalabim_slice(axis)]
variance = np.average((values-average)**2, weights=weights, axis=axis)
return np.sqrt(variance)
axis = self.get_axis_number(axis)
std_hist = weighted_std(self.all_axis_bin_centers(axis),
weights=self.histogram, axis=axis)
if self.dimensions == 2:
new_hist = Hist1d
else:
new_hist = Histdd
return new_hist.from_histogram(histogram=std_hist,
bin_edges=itemgetter(*self.other_axes(axis))(self.bin_edges),
axis_names=self.axis_names_without(axis))
##
# Other stuff
##
def bin_volumes(self):
return reduce(np.multiply, np.ix_(*[np.diff(bs) for bs in self.bin_edges]))
def rebin(self, *factors, **kwargs):
"""Return a new histogram that is 'rebinned' (zoomed) by factors (tuple of floats) along each dimensions
factors: tuple with zoom factors along each axis. e.g. 2 = double number of bins, 0.5 = halve them.
order: Order for spline interpolation in scipy.ndimage.zoom. Defaults to linear interpolation (order=1).
The only accepted keyword argument is 'order'!!! (python 2 is not nice)
The normalization is set to the normalization of the current histogram
The factors don't have to be integers or fractions: scipy.ndimage.zoom deals with the rebinning arcana.
"""
if not HAVE_SCIPY:
raise NotImplementedError("Rebinning requires scipy.ndimage")
if any([x != 'order' for x in kwargs.keys()]):
raise ValueError("Only 'order' keyword argument is accepted. Yeah, this is confusing.. blame python 2.")
order = kwargs.get('order', 1)
# Construct a new histogram
mh = self.similar_blank_histogram()
if not len(factors) == self.dimensions:
raise ValueError("You must pass %d rebin factors to rebin a %d-dimensional histogram" % (
self.dimensions, self.dimensions
))
# Zoom the bin edges.
# It's a bit tricky for non-uniform bins:
# we first construct a linear interpolator to take
# fraction along axis -> axis coordinate according to current binning.
# Then we feed it the new desired binning fractions.
for i, f in enumerate(factors):
x = self.bin_edges[i]
mh.bin_edges[i] = np.interp(
x=np.linspace(0, 1, round((len(x) - 1) * f) + 1),
xp=np.linspace(0, 1, len(x)),
fp=x)
# Rebin the histogram using ndimage.zoom, then renormalize
mh.histogram = zoom(self.histogram, factors, order=order)
if mh.histogram.sum() != 0:
mh.histogram *= self.histogram.sum() / mh.histogram.sum()
# mh.histogram /= np.product(factors)
return mh
def get_random(self, size=10):
"""Returns (size, n_dim) array of random variates from the histogram.
Inside the bins, a uniform distribution is assumed
Note this assumes the histogram is an 'events per bin', not a pdf.
TODO: test more.
"""
# Sample random bin centers
bin_centers_ravel = np.array(np.meshgrid(*self.bin_centers(),
indexing='ij')).reshape(self.dimensions, -1).T
hist_ravel = self.histogram.ravel()
hist_ravel = hist_ravel.astype(float)
hist_ravel = hist_ravel / np.nansum(hist_ravel)
result = bin_centers_ravel[np.random.choice(len(bin_centers_ravel),
p=hist_ravel,
size=size)]
# Randomize the position inside the bin
for dim_i in range(self.dimensions):
bin_edges = self.bin_edges[dim_i]
bin_widths = np.diff(bin_edges)
# Note the - 1: for the first bin's bin center, searchsorted gives 1, but we want 0 here:
index_of_bin = np.searchsorted(bin_edges, result[:, dim_i]) - 1
result[:, dim_i] += (np.random.rand(size) - 0.5) * bin_widths[index_of_bin]
return result
def lookup(self, *coordinate_arrays):
"""Lookup values at specific points.
coordinate_arrays: numpy arrays of coordinates, one for each dimension
e.g. lookup(np.array([0, 2]), np.array([1, 3])) looks up (x=0, y=1) and (x=2, y3).
Clips if out of range!! TODO: option to throw exception instead.
TODO: Needs tests!!
TODO: port to Hist1d... or finally join the classes
TODO: Support for scalar arguments
"""
assert len(coordinate_arrays) == self.dimensions
# Convert each coordinate array to an index array
index_arrays = [np.clip(np.searchsorted(self.bin_edges[i], coordinate_arrays[i]) - 1,
0,
len(self.bin_edges[i]) - 2)
for i in range(self.dimensions)]
# Use the index arrays to slice the histogram
return self.histogram[tuple(index_arrays)]
# Check against slow version:
# def hist_to_interpolator_slow(mh):
# bin_centers_ravel = np.array(np.meshgrid(*mh.bin_centers(), indexing='ij')).reshape(2, -1).T
# return NearestNDInterpolator(bin_centers_ravel, mh.histogram.ravel())
# x = np.random.uniform(0, 400, 100)
# y = np.random.uniform(0, 200, 100)
# hist_to_interpolator(mh)(x, y) - hist_to_interpolator_slow(mh)(x, y)
def lookup_hist(self, mh):
"""Return histogram within binning of Histdd mh, with values looked up in this histogram.
This is not rebinning: no interpolation /renormalization is performed.
It's just a lookup.
"""
result = mh.similar_blank_histogram()
points = np.stack([mh.all_axis_bin_centers(i)
for i in range(mh.dimensions)]).reshape(mh.dimensions, -1)
values = self.lookup(*points)
result.histogram = values.reshape(result.histogram.shape)
return result
def plot(self, log_scale=False, log_scale_vmin=1,
colorbar=True,
cblabel='Number of entries',
colorbar_kwargs=None,
plt=plt,
**kwargs):
if colorbar_kwargs is None:
colorbar_kwargs = dict()
colorbar_kwargs['label'] = cblabel
if not CAN_PLOT:
raise ValueError("matplotlib did not import, so can't plot your histogram...")
if self.dimensions == 1:
return Hist1d.from_histogram(self.histogram, self.bin_edges[0]).plot(**kwargs)
elif self.dimensions == 2:
if log_scale:
kwargs.setdefault('norm',
matplotlib.colors.LogNorm(
vmin=kwargs.pop('vmin', max(log_scale_vmin, self.histogram.min())),
vmax=kwargs.pop('vmax', self.histogram.max()))
)
mesh = plt.pcolormesh(self.bin_edges[0], self.bin_edges[1], self.histogram.T, **kwargs)
plt.xlim(np.min(self.bin_edges[0]), np.max(self.bin_edges[0]))
plt.ylim(np.min(self.bin_edges[1]), np.max(self.bin_edges[1]))
if self.axis_names:
plt.xlabel(self.axis_names[0])
plt.ylabel(self.axis_names[1])
if colorbar:
cb = plt.colorbar(**colorbar_kwargs)
cb.ax.minorticks_on()
return mesh, cb
return mesh
else:
raise ValueError("Can only plot 1- or 2-dimensional histograms!")
Histdd.project = Histdd.projection
if __name__ == '__main__':
# Create histograms just like from numpy...
m = Hist1d([0, 3, 1, 6, 2, 9], bins=3)
# ...or add data incrementally:
m = Hist1d(bins=100, range=(-3, 4))
m.add(np.random.normal(0, 0.5, 10**4))
m.add(np.random.normal(2, 0.2, 10**3))
# Get the data back out:
print(m.histogram, m.bin_edges)
if CAN_PLOT:
# Access derived quantities like bin_centers, normalized_histogram, density, cumulative_density, mean, std
plt.plot(m.bin_centers, m.normalized_histogram, label="Normalized histogram", linestyle='steps')
plt.plot(m.bin_centers, m.density, label="Empirical PDF", linestyle='steps')
plt.plot(m.bin_centers, m.cumulative_density, label="Empirical CDF", linestyle='steps')
plt.title("Estimated mean %0.2f, estimated std %0.2f" % (m.mean, m.std))
plt.legend(loc='best')
plt.show()
# Slicing and arithmetic behave just like ordinary ndarrays
print("The fourth bin has %d entries" % m[3])
m[1:4] += 4 + 2 * m[-27:-24]
print("Now it has %d entries" % m[3])
# Of course I couldn't resist adding a canned plotting function:
if CAN_PLOT:
m.plot()
plt.show()
# Create and show a 2d histogram. Axis names are optional.
m2 = Histdd(bins=100, range=[[-5, 3], [-3, 5]], axis_names=['x', 'y'])
m2.add(np.random.normal(1, 1, 10**6), np.random.normal(1, 1, 10**6))
m2.add(np.random.normal(-2, 1, 10**6), np.random.normal(2, 1, 10**6))
# x and y projections return Hist1d objects
m2.projection('x').plot(label='x projection')
m2.projection(1).plot(label='y projection')
plt.legend()
plt.show()
##
# Error bar helpers
##
# Zero-background 1 sigma Poisson Feldman-Cousins intervals
# From table II in https://arxiv.org/pdf/physics/9711021.pdf
_fc_intervals = np.array([
[0.0, 1.29],
[0.37, 2.75],
[0.74, 4.25],
[1.1, 5.3],
[2.34, 6.78],
[2.75, 7.81],
[3.82, 9.28],
[4.25, 10.3],
[5.3, 11.32],
[6.44, 12.79],
[6.78, 13.81],
[7.81, 14.82],
[8.83, 16.29],
[9.28, 17.3],
[10.3, 18.32],
[11.32, 19.32],
[12.33, 20.8],
[12.79, 21.81],
[13.81, 22.82],
[14.82, 23.82],
[15.83, 25.3]])
def poisson_central_interval(k, cl=0.6826894921370859):
"""Return central Poisson confidence interval
:param k: observed events
:param cl: confidence level
"""
if not HAVE_SCIPY:
raise NotImplementedError("Poisson errors require scipy")
# Adapted from https://stackoverflow.com/a/14832525
k = np.asarray(k).astype(int)
alpha = 1 - cl
low = stats.chi2.ppf(alpha / 2, 2 * k) / 2
high = stats.chi2.ppf(1 - alpha / 2, 2 * k + 2) / 2
return np.stack([np.nan_to_num(low), high])
def poisson_1s_interval(k, fc=True):
"""Return (low, high) 1 sigma Poisson confidence intervals
:param k: Observed events (int or array of ints)
:param fc: if True (default), use Feldman-Cousins for k <= 20,
and central intervals otherwise.
(at k = 20, the difference between these is 1-2%).
"""
k = np.asarray(k).astype(int)
result = poisson_central_interval(k)
if fc:
mask = k <= 20
result[:, mask] = _fc_intervals[k[mask]].T
return result
def _find_axis_bin_index(value, bin_edges, axis):
"""Returns index in bin_edges that contains value. Inclusive on both endpoints.
Raises CoordinateOutOfRangeException if value is out of range.
The axis argument is only used for formatting this error message.
"""
# The right bin edge of np.histogram is inclusive:
if value == bin_edges[-1]:
# Minus two: one for bin edges rather than centers, one for 0-based indexing
return len(bin_edges) - 2
# For all other bins, it is exclusive.
result = np.searchsorted(bin_edges, [value], side='right')[0] - 1
if not 0 <= result <= len(bin_edges) - 1:
raise CoordinateOutOfRangeException("Value %s is not in range (%s-%s) of axis %s" % (