|
| 1 | +""" |
| 2 | +Calculating the Pearson correlation coefficient between datasets |
| 3 | +================================================================ |
| 4 | +
|
| 5 | +In this recipe, we will take two datasets, one for an independent variable |
| 6 | +(in this example elevation) and one for a dependent variable (snow |
| 7 | +cover over a particular day), regrid them to the same resolution then |
| 8 | +calculate the correlation coefficient, to get a measure of the relationship |
| 9 | +between them. |
| 10 | +
|
| 11 | +""" |
| 12 | + |
| 13 | +# %% |
| 14 | +# 1. Import cf-python, cf-plot and other required packages: |
| 15 | +import cfplot as cfp |
| 16 | +import cf |
| 17 | + |
| 18 | +import matplotlib.pyplot as plt |
| 19 | +import scipy.stats.mstats as mstats |
| 20 | + |
| 21 | +# %% |
| 22 | +# 2. Read the data in and unpack the Fields from FieldLists using indexing. |
| 23 | +# In our example we are investigating the influence of the land height on |
| 24 | +# the snow cover extent, so snow cover is the dependent variable. The snow |
| 25 | +# cover data is the |
| 26 | +# 'Snow Cover Extent 2017-present (raster 500 m), Europe, daily – version 1' |
| 27 | +# sourced from the Copernicus Land Monitoring Service which is described at: |
| 28 | +# https://land.copernicus.eu/en/products/snow/snow-cover-extent-europe-v1-0-500m |
| 29 | +# and the elevation data is the 'NOAA NGDC GLOBE topo: elevation data' dataset |
| 30 | +# which can be sourced from the IRI Data Library, or details found, at: |
| 31 | +# http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NGDC/.GLOBE/.topo/index.html. |
| 32 | +orog = cf.read("~/recipes/1km_elevation.nc")[0] |
| 33 | +snow = cf.read("~/recipes/snowcover")[0] |
| 34 | + |
| 35 | +# %% |
| 36 | +# 3. Choose the day of pre-aggregated snow cover to investigate. We will |
| 37 | +# take the first datetime element corresponding to the first day from the |
| 38 | +# datasets, 1st January 2024, but by changing the index you can explore |
| 39 | +# other days. We also get the string corresponding to |
| 40 | +# the date, to reference later: |
| 41 | +snow_day = snow[0] |
| 42 | +snow_day_dt = snow_day.coordinate("time")[0].data |
| 43 | +snow_day_daystring = f"{snow_day_dt.datetime_as_string[0].split(' ')[0]}" |
| 44 | + |
| 45 | +# %% |
| 46 | +# 4. Choose the region over which to compare the two datasets. It must be |
| 47 | +# covered by both datasets, though not necessarily on the same grid, since |
| 48 | +# here we subspace both to the same area and in a later step we regrid them |
| 49 | +# onto a common grid, ready for comparison. By changing the longitude and |
| 50 | +# latitude bounds in the tuple below, you can change the area that is |
| 51 | +# used: |
| 52 | +region_in_mid_uk = ((-3.0, -1.0), (52.0, 55.0)) |
| 53 | +sub_orog = orog.subspace( |
| 54 | + longitude=cf.wi(*region_in_mid_uk[0]), latitude=cf.wi(*region_in_mid_uk[1]) |
| 55 | +) |
| 56 | +sub_snow = snow_day.subspace( |
| 57 | + longitude=cf.wi(*region_in_mid_uk[0]), latitude=cf.wi(*region_in_mid_uk[1]) |
| 58 | +) |
| 59 | + |
| 60 | +# %% |
| 61 | +# 5. Ensure data quality, since the standard name here corresponds to a |
| 62 | +# unitless fraction, but the values are in the tens, so we need to |
| 63 | +# normalise these to all lie between 0 and 1 and change the units |
| 64 | +# appropriately: |
| 65 | +sub_snow = ((sub_snow - sub_snow.minimum()) / (sub_snow.range())) |
| 66 | +sub_snow.override_units("1", inplace=True) |
| 67 | + |
| 68 | +# %% |
| 69 | +# 6. Regrid the data so that they lie on the same grid and therefore each |
| 70 | +# array structure has values with corresponding geospatial points that |
| 71 | +# can be statistically compared. Here the elevation field is regridded to the |
| 72 | +# snow field since the snow is higher-resolution, but the other way round is |
| 73 | +# possible by switching the field order: |
| 74 | +regridded_orog = sub_orog.regrids(sub_snow, method="linear") |
| 75 | + |
| 76 | +# %% |
| 77 | +# 7. Squeeze the snow data to remove the size 1 axes so we have arrays of |
| 78 | +# the same dimensions for each of the two fields to compare: |
| 79 | +sub_snow = sub_snow.squeeze() |
| 80 | + |
| 81 | +# %% |
| 82 | +# 8. Finally, perform the statistical calculation by using the SciPy method |
| 83 | +# to find the Pearson correlation coefficient for the two arrays now they are |
| 84 | +# in comparable form. Note we need to use 'scipy.stats.mstats' and not |
| 85 | +# 'scipy.stats' for the 'pearsonr' method, to account for masked |
| 86 | +# data in the array(s) properly: |
| 87 | +coefficient = mstats.pearsonr(regridded_orog.array, sub_snow.array) |
| 88 | +print(f"The Pearson correlation coefficient is: {coefficient}") |
| 89 | + |
| 90 | +# %% |
| 91 | +# 9. Make a final plot showing the two arrays side-by-side and quoting the |
| 92 | +# determined Pearson correlation coefficient to illustrate the relationship |
| 93 | +# and its strength visually. We use 'gpos' to position the plots in two |
| 94 | +# columns and apply some specific axes ticks and labels for clarity. |
| 95 | +cfp.gopen( |
| 96 | + rows=1, columns=2, top=0.85, |
| 97 | + file="snow_and_orog_on_same_grid.png", |
| 98 | + user_position=True, |
| 99 | +) |
| 100 | + |
| 101 | +# Joint configuration of the plots, including adding an overall title |
| 102 | +plt.suptitle( |
| 103 | + ( |
| 104 | + "Snow cover compared to elevation for the same area of the UK " |
| 105 | + f"aggregated across\n day {snow_day_daystring} with correlation " |
| 106 | + "coefficient (on the same grid) of " |
| 107 | + f"{coefficient.statistic:.4g} (4 s.f.)" |
| 108 | + ), |
| 109 | + fontsize=17, |
| 110 | +) |
| 111 | +cfp.mapset(resolution="10m") |
| 112 | +cfp.setvars(ocean_color="white", lake_color="white") |
| 113 | +label_info = { |
| 114 | + "xticklabels": ("3W", "2W", "1W"), |
| 115 | + "yticklabels": ("52N", "53N", "54N", "55N"), |
| 116 | + "xticks": (-3, -2, -1), |
| 117 | + "yticks": (52, 53, 54, 55), |
| 118 | +} |
| 119 | + |
| 120 | +# Plot the two contour plots as columns |
| 121 | +cfp.gpos(1) |
| 122 | +cfp.cscale("wiki_2_0_reduced") |
| 123 | +cfp.con( |
| 124 | + regridded_orog, |
| 125 | + lines=False, |
| 126 | + title="Elevation (from 1km-resolution orography)", |
| 127 | + colorbar_drawedges=False, |
| 128 | + **label_info, |
| 129 | +) |
| 130 | +cfp.gpos(2) |
| 131 | +# Don't add extensions on the colourbar since it can only be 0 to 1 inclusive |
| 132 | +cfp.levs(min=0, max=1, step=0.1, extend="neither") |
| 133 | +cfp.cscale("precip_11lev", ncols=11, reverse=1) |
| 134 | +cfp.con(sub_snow, lines=False, |
| 135 | + title="Snow cover extent (from satellite imagery)", |
| 136 | + colorbar_drawedges=False, |
| 137 | + **label_info |
| 138 | +) |
| 139 | + |
| 140 | +cfp.gclose() |
0 commit comments