Skip to content

Commit f8c7575

Browse files
committed
dev
1 parent ca69ad1 commit f8c7575

File tree

3 files changed

+192
-10
lines changed

3 files changed

+192
-10
lines changed

Changelog.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ version NEXTVERSION
33

44
**2024-??-??**
55

6+
* New keyword parameter to `cf.histogram`: ``density``
7+
(https://github.com/NCAS-CMS/cf-python/issues/794)
68
* New method: `cf.Field.is_discrete_axis`
79
(https://github.com/NCAS-CMS/cf-python/issues/784)
810
* Include the UM version as a field property when reading UM files

cf/maths.py

Lines changed: 69 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def relative_vorticity(
9292
) # pragman: no cover
9393

9494

95-
def histogram(*digitized):
95+
def histogram(*digitized, density=False):
9696
"""Return the distribution of a set of variables in the form of an
9797
N-dimensional histogram.
9898
@@ -151,6 +151,17 @@ def histogram(*digitized):
151151
manipulating the dimensions of the digitized field
152152
construct's data to ensure that broadcasting can occur.
153153
154+
density: `bool`, optional
155+
If False, the default, the result will contain the number
156+
of samples in each bin. If True, the result is the value
157+
of the probability density function at the bin, normalized
158+
such that the integral over the range is 1. Note that the
159+
sum of the histogram values will not be equal to 1 unless
160+
bins of unity size are chosen; it is not a probability
161+
mass function.
162+
163+
.. versionadded:: NEXTVERSION
164+
154165
:Returns:
155166
156167
`Field`
@@ -197,7 +208,7 @@ def histogram(*digitized):
197208
[0.1174 0.1317]
198209
[0.1317 0.146 ]]
199210
>>> h = cf.histogram(indices)
200-
>>> rint(h)
211+
>>> print(h)
201212
Field: number_of_observations
202213
-----------------------------
203214
Data : number_of_observations(specific_humidity(10)) 1
@@ -216,7 +227,21 @@ def histogram(*digitized):
216227
[0.1031 0.1174]
217228
[0.1174 0.1317]
218229
[0.1317 0.146 ]]
230+
>>> p = cf.histogram(indices, density=True)
231+
>>> print(p)
232+
Field: long_name=probability density function
233+
---------------------------------------------
234+
Data : long_name=probability density function(specific_humidity(10))
235+
Cell methods : latitude: longitude: point
236+
Dimension coords: specific_humidity(10) = [0.01015, ..., 0.13885] 1
237+
>>> print(p.data.round(2).array))
238+
[15.73 12.24 15.73 6.99 8.74 1.75 1.75 1.75 3.5 1.75]
239+
240+
The sum of the density values multiplied by the bin sizes is
241+
unity:
219242
243+
>>> print((p * p.dimension_coordinate().cellsize).sum().array)
244+
1.0
220245
221246
Create a two-dimensional histogram based on specific humidity and
222247
temperature bins. The temperature bins in this example are derived
@@ -225,8 +250,8 @@ def histogram(*digitized):
225250
226251
>>> g = f.copy()
227252
>>> g.standard_name = 'air_temperature'
228-
>>> import numpy
229-
>>> g[...] = numpy.random.normal(loc=290, scale=10, size=40).reshape(5, 8)
253+
>>> import numpy as np
254+
>>> g[...] = np.arange(40).reshape(5, 8) + 253.15
230255
>>> g.override_units('K', inplace=True)
231256
>>> print(g)
232257
Field: air_temperature (ncvar%q)
@@ -247,13 +272,28 @@ def histogram(*digitized):
247272
Dimension coords: air_temperature(5) = [281.1054839143287, ..., 313.9741786365939] K
248273
: specific_humidity(10) = [0.01015, ..., 0.13885] 1
249274
>>> print(h.array)
250-
[[2 1 5 3 2 -- -- -- -- --]
251-
[1 1 2 -- 1 -- 1 1 -- --]
252-
[4 4 2 1 1 1 -- -- 1 1]
253-
[1 1 -- -- 1 -- -- -- 1 --]
254-
[1 -- -- -- -- -- -- -- -- --]]
275+
[[3 3 2 -- -- -- -- -- -- --]
276+
[1 1 2 1 3 -- -- -- -- --]
277+
[1 -- -- 1 -- 1 1 1 2 1]
278+
[2 1 1 2 2 -- -- -- -- --]
279+
[2 2 4 -- -- -- -- -- -- --]]
255280
>>> h.sum()
256281
<CF Data(): 40 1>
282+
>>> p = cf.histogram(indices, indices_t, density=True)
283+
>>> print(p)
284+
Field: long_name=probability density function
285+
---------------------------------------------
286+
Data : long_name=probability density function(air_temperature(5), specific_humidity(10))
287+
Cell methods : latitude: longitude: point
288+
Dimension coords: air_temperature(5) = [257.05, ..., 288.25] K
289+
: specific_humidity(10) = [0.01015, ..., 0.13885] 1
290+
>>> print(p.array)
291+
>>> print(p.data.round(2).array))
292+
[[0.67 0.67 0.45 -- -- -- -- -- -- --]
293+
[0.22 0.22 0.45 0.22 0.67 -- -- -- -- --]
294+
[0.22 -- -- 0.22 -- 0.22 0.22 0.22 0.45 0.22]
295+
[0.45 0.22 0.22 0.45 0.45 -- -- -- -- --]
296+
[0.45 0.45 0.9 -- -- -- -- -- -- --]]
257297
258298
"""
259299
if not digitized:
@@ -264,7 +304,26 @@ def histogram(*digitized):
264304
f = digitized[0].copy()
265305
f.clear_properties()
266306

267-
return f.bin("sample_size", digitized=digitized)
307+
out = f.bin("sample_size", digitized=digitized)
308+
309+
if density:
310+
# Get the measure of each bin. This is the outer product of
311+
# the cell sizes of each dimension coordinate construct.
312+
data_axes = out.get_data_axes()
313+
dim = out.dimension_coordinate(filter_by_axis=(data_axes[0],))
314+
bin_measures = dim.cellsize
315+
for axis in data_axes[1:]:
316+
dim = out.dimension_coordinate(filter_by_axis=(axis,))
317+
bin_measures.outerproduct(dim.cellsize, inplace=True)
318+
319+
# Convert counts to densities
320+
out /= out.data.sum() * bin_measures
321+
322+
out.override_units(Units(), inplace=True)
323+
out.del_property("standard_name", None)
324+
out.set_property("long_name", "probability density function")
325+
326+
return out
268327

269328

270329
def curl_xy(fx, fy, x_wrap=None, one_sided_at_boundary=False, radius=None):

cf/test/test_Maths.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
faulthandler.enable() # to debug seg faults and timeouts
66

7+
import numpy as np
8+
79
import cf
810

911

@@ -207,6 +209,125 @@ def test_differential_operators(self):
207209
zeros[...] = 0
208210
self.assertTrue(cg.data.equals(zeros.data, rtol=0, atol=1e-15))
209211

212+
def test_histogram(self):
213+
f = cf.example_field(0)
214+
g = f.copy()
215+
g.standard_name = "air_temperature"
216+
g[...] = np.arange(40).reshape(5, 8) + 253.15
217+
g.override_units("K", inplace=True)
218+
219+
# 1-d histogram
220+
indices = f.digitize(10)
221+
h = cf.histogram(indices)
222+
self.assertTrue((h.array == [9, 7, 9, 4, 5, 1, 1, 1, 2, 1]).all)
223+
h = cf.histogram(indices, density=True)
224+
self.assertTrue(
225+
(
226+
h.array
227+
== [
228+
15.734265734265733,
229+
12.237762237762242,
230+
15.734265734265733,
231+
6.9930069930069925,
232+
8.74125874125874,
233+
1.748251748251749,
234+
1.7482517482517475,
235+
1.748251748251749,
236+
3.496503496503498,
237+
1.748251748251744,
238+
]
239+
).all
240+
)
241+
integral = (h * h.dimension_coordinate().cellsize).sum()
242+
self.assertEqual(integral.array, 1)
243+
244+
# 2-d histogram
245+
indices_t = g.digitize(5)
246+
h = cf.histogram(indices, indices_t)
247+
self.assertEqual(h.Units, cf.Units())
248+
self.assertTrue(
249+
(
250+
h.array
251+
== [
252+
[3, 3, 2, -1, -1, -1, -1, -1, -1, -1],
253+
[1, 1, 2, 1, 3, -1, -1, -1, -1, -1],
254+
[1, -1, -1, 1, -1, 1, 1, 1, 2, 1],
255+
[2, 1, 1, 2, 2, -1, -1, -1, -1, -1],
256+
[2, 2, 4, -1, -1, -1, -1, -1, -1, -1],
257+
]
258+
).all()
259+
)
260+
261+
h = cf.histogram(indices, indices_t, density=True)
262+
self.assertEqual(h.Units, cf.Units())
263+
self.assertTrue(
264+
(
265+
h.array
266+
== [
267+
[
268+
0.6724045185583661,
269+
0.6724045185583664,
270+
0.44826967903891074,
271+
-1,
272+
-1,
273+
-1,
274+
-1,
275+
-1,
276+
-1,
277+
-1,
278+
],
279+
[
280+
0.22413483951945457,
281+
0.2241348395194546,
282+
0.44826967903890913,
283+
0.22413483951945457,
284+
0.6724045185583637,
285+
-1,
286+
-1,
287+
-1,
288+
-1,
289+
-1,
290+
],
291+
[
292+
0.22413483951945457,
293+
-1,
294+
-1,
295+
0.22413483951945457,
296+
-1,
297+
0.2241348395194547,
298+
0.22413483951945448,
299+
0.2241348395194547,
300+
0.4482696790389094,
301+
0.224134839519454,
302+
],
303+
[
304+
0.4482696790389124,
305+
0.22413483951945626,
306+
0.2241348395194562,
307+
0.4482696790389124,
308+
0.4482696790389124,
309+
-1,
310+
-1,
311+
-1,
312+
-1,
313+
-1,
314+
],
315+
[
316+
0.4482696790389059,
317+
0.44826967903890597,
318+
0.8965393580778118,
319+
-1,
320+
-1,
321+
-1,
322+
-1,
323+
-1,
324+
-1,
325+
-1,
326+
],
327+
]
328+
).all()
329+
)
330+
210331

211332
if __name__ == "__main__":
212333
print("Run date:", datetime.datetime.now())

0 commit comments

Comments
 (0)