Skip to content

Commit e19f709

Browse files
committed
Adding/editing documentation for review
1 parent 0cbcffe commit e19f709

12 files changed

+423
-81
lines changed
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
---
2+
title: "EPC data: technical documentation"
3+
output:
4+
word_document:
5+
reference_doc: SERL_word_template_portrait1.docx
6+
toc: no
7+
---
8+
9+
10+
```{r setup, include=FALSE}
11+
knitr::opts_chunk$set(echo = FALSE)
12+
13+
library(data.table)
14+
library(lubridate)
15+
library(knitr)
16+
library(captioner)
17+
library(flextable)
18+
19+
20+
21+
# Caption generators: each call to one of these registers (or looks up) a
# numbered caption with the given prefix.
tab_caps <- captioner(prefix = "Table")
fig_caps <- captioner(prefix = "Figure")
23+
24+
```
25+
26+
```{r defineInputs, include=FALSE}
27+
28+
# Run switches ----
# load_epc:  TRUE reads the raw csv and saves a fresh .RData copy;
#            FALSE if using previously saved .RData
# write_epc: TRUE if we want to save a new csv data file
load_epc <- FALSE
write_epc <- FALSE

# Raw EPC extract (input) ----
epc_folder <- "S:/ENERGINST_EaB_Project_17_SMRP/Data/Researcher data/EPC data/"
epc_loading_filename <- "SERL EPC Data.csv"

# Saved dataset (output); the file extension is appended where it is used ----
saving_location <- "S:/ENERGINST_EaB_Project_17_SMRP/Data/Researcher data/JulyStaticDataset/"
epc_saving_filename <- "SERL_EPC_data_v2020_07"
35+
36+
```
37+
38+
```{r functions, include = FALSE}
39+
40+
# Tabulate the distinct values of one column of a data.table.
#
# Args:
#   i: Integer position of the column to tabulate.
#   d: data.table to summarise; defaults to the global `epc`.
#
# Returns:
#   A data.table with one row per distinct value and three columns:
#   `Variable` (the column name, repeated), `Value` (the distinct value,
#   in `keyby` sort order) and `Cases` (the number of rows with that value).
get.cases <- function(i, d = epc) {
  v <- colnames(d)[i]
  # Count on `d`, not on the global `epc`, so the `d` argument is honoured.
  # `c(v)` makes data.table treat `v` as a character vector of column names
  # even if `d` happens to contain a column that is itself called "v".
  counts <- d[, .N, keyby = c(v)]
  data.table(
    Variable = rep(v, nrow(counts)),
    Value = counts[[1L]],
    Cases = counts[[2L]]
  )
}
49+
50+
# Turn a table into a flextable with the house style: booktabs rules, a blue
# header with white, italic, non-bold text, and grey shading on odd body rows.
#
# Args:
#   t: The table to format (passed straight to flextable()).
#   autofit: If TRUE, size the columns with flextable's autofit(); otherwise
#     set the table layout property to "autofit".
#
# Returns:
#   The formatted flextable object.
my.flex <- function(t, autofit = FALSE) {
  header_fill <- rgb(84/255, 141/255, 212/255, 1)
  stripe_fill <- rgb(242/255, 242/255, 242/255, 1)

  ft <- theme_booktabs(flextable(t))
  ft <- theme_zebra(
    ft,
    odd_header = header_fill,
    even_body = "transparent",
    odd_body = stripe_fill
  )

  # Header text styling
  ft <- color(ft, color = "white", part = "header")
  ft <- italic(ft, italic = TRUE, part = "header")
  ft <- bold(ft, bold = FALSE, part = "header")

  # The `autofit` argument is a logical flag; the call autofit(ft) below still
  # resolves to the flextable function because R skips non-function bindings
  # when looking up a name in call position.
  if (autofit == TRUE) {
    return(autofit(ft))
  }
  set_table_properties(ft, layout = "autofit")
}
67+
68+
```
69+
70+
```{r loadData, include=FALSE}
71+
72+
# Either rebuild `epc` from the raw csv (and cache it as .RData) or restore the
# cached copy, depending on `load_epc`.
if (load_epc == TRUE) {
  epc_orig <- fread(paste0(epc_folder, epc_loading_filename))
  # Work on a copy so the raw table stays untouched by the rename below.
  epc <- copy(epc_orig)
  setnames(epc, old = "puprn", new = "PUPRN")
  save(
    epc,
    file = paste0(saving_location, epc_saving_filename, ".RData")
  )
} else {
  # Restores the objects stored in the file (expected to include `epc`).
  load(paste0(saving_location, epc_saving_filename, ".RData"))
}

# Optionally export the dataset as csv alongside the .RData file.
if (write_epc == TRUE) {
  fwrite(
    epc,
    file = paste0(saving_location, epc_saving_filename, ".csv")
  )
}
86+
87+
```
88+
89+
```{r docInfoTable, include = FALSE}
90+
# Document information table, written as one attribute/value pair per row so
# each label stays next to its value.
info.tab <- rbindlist(list(
  list(attributes = "Creation date", info = "2020-08-22"),
  list(attributes = "Data version",  info = "2020-07"),
  list(attributes = "Author",        info = "Ellen Webborn"),
  list(attributes = "Project",       info = "Smart Energy Research Lab (SERL)"),
  list(attributes = "Organisation",  info = "University College London (UCL)")
))
103+
104+
```
105+
106+
```{r}
107+
kable(info.tab)
108+
```
109+
110+
```{r captions, include = FALSE}
111+
112+
# Register the table captions. The narrative text refers to these tables as
# Table 1, 2 and 3, so the registration order here should match that order
# (captioner is expected to number captions in order of first registration).
tab_caps(name = "epc_fields",
         caption = "All EPC variables, the number of unique values found for each variable, the variable (R) class, and an example from the dataset.")

tab_caps(name = "summary_short",
         caption = "The number and percent of each value found in the dataset for each variable with fewer than 10 unique values found.")

# Caption typo fixed: "statistcs" -> "statistics".
tab_caps(name = "stats",
         caption = "Basic statistics for integer and numeric variables. 'n' is the number of values used in the calculations (i.e. the non-NA values).")
120+
121+
```
122+
123+
```{r summary, include = FALSE}
124+
ncol_epc <- ncol(epc)
nrow_epc <- nrow(epc)

# Value counts ----

# Tabulate every column except the first, which is assumed to be the PUPRN
# identifier; it gets a single placeholder row instead of a value table.
# seq_len(...)[-1L] is empty (rather than c(2, 1)) if there is only one column.
test_list <- lapply(seq_len(ncol_epc)[-1L], get.cases)

summary <- rbindlist(test_list)
summary <- rbind(
  data.table(Variable = "PUPRN", Value = "-", Cases = nrow_epc),
  summary
)
summary[, Percent := round(Cases / nrow_epc * 100, 2)]

# Per-variable overview ----

# `position` remembers the original column order, because the keyed join
# below sorts the table by variable name.
epc_fields <- data.table(
  Variable = colnames(epc),
  class = lapply(epc, class),
  example = rep(NA_character_, ncol_epc),
  position = seq_len(ncol_epc)
)
setkey(epc_fields, Variable)

# The number of rows per variable in `summary` is its number of distinct values.
tmp <- summary[, .N, keyby = Variable]
epc_fields <- tmp[epc_fields]
setnames(epc_fields, old = "N", new = "uniqueValues")

# PUPRN only has a placeholder row in `summary`; report one value per record.
epc_fields[Variable == "PUPRN", uniqueValues := nrow_epc]

# Back to dataset column order, so row k describes column k of `epc`.
setkey(epc_fields, position)

# Pick one example value per variable from a randomly chosen row. The seed and
# the draw-per-column order are kept exactly as they were, because the manual
# replacements further down were chosen for the examples this seed produces.
set.seed(3)
for (col_idx in seq_len(ncol_epc)) {
  epc_fields[col_idx,
             example := epc[round(runif(n = 1, min = 1, max = nrow_epc), 0),
                            col_idx,
                            with = FALSE]]
}

# Replace cases that gave missing, NA, etc. values
epc_fields[Variable == "floor_level", example := "Ground"]
epc_fields[Variable == "flat_top_storey", example := "Y"]
epc_fields[Variable == "flat_storey_count", example := "2"]
epc_fields[Variable == "number_open_fireplaces", example := "0"]
epc_fields[Variable == "floor_energy_eff", example := "Very Good"]
epc_fields[Variable == "floor_env_eff", example := "Good"]
epc_fields[Variable == "sheating_energy_eff", example := "N/A"] # No other responses
epc_fields[Variable == "sheating_env_eff", example := "N/A"] # No other responses
epc_fields[Variable == "roof_energy_eff", example := "Good"]
epc_fields[Variable == "main_fuel", example := "oil (not community)"]
epc_fields[Variable == "heat_loss_corridoor", example := "unheated corridor"]
epc_fields[Variable == "unheated_corridor_length", example := "4.998"]
epc_fields[Variable == "floor_height", example := "2.400"]

# Value breakdown for low-cardinality variables ----

for_short_summary <- epc_fields[uniqueValues < 10, Variable]
summary_short <- summary[Variable %in% for_short_summary | Variable == "PUPRN", ]

# Statistics for numeric variables ----

# A column qualifies when its (first) class is numeric or integer. Testing
# only the first class element keeps the condition length-one for columns
# with several classes, which would otherwise make `if` fail in R >= 4.2.
is_numeric_col <- vapply(
  epc,
  function(column) class(column)[1L] %in% c("numeric", "integer"),
  logical(1)
)
for_stats_numeric <- colnames(epc)[is_numeric_col]

m <- length(for_stats_numeric)
stats <- data.table(
  Variable = for_stats_numeric,
  N = rep(NA_integer_, m),
  Min = rep(NA_real_, m),
  Max = rep(NA_real_, m),
  Mean = rep(NA_real_, m),
  SD = rep(NA_real_, m)
)

# seq_len(m) skips the loop entirely when there are no numeric columns.
for (i in seq_len(m)) {
  # Extract by name with [[ ]] so the lookup cannot collide with other
  # variables that are visible inside a data.table expression.
  values <- epc[[for_stats_numeric[i]]]
  stats[i, `:=`(
    N = sum(!is.na(values)),
    Min = as.numeric(min(values, na.rm = TRUE)),
    Max = as.numeric(max(values, na.rm = TRUE)),
    Mean = round(mean(values, na.rm = TRUE), 2),
    SD = round(sd(values, na.rm = TRUE), 2)
  )]
}

# Tables ----

## epc_fields
epc_fields[, position := NULL]
setnames(epc_fields, c("variable", "n unique values", "class", "example"))
epc_fields <- my.flex(epc_fields)

## summary_short
setnames(summary_short, c("variable", "value", "number", "percent"))
summary_short <- my.flex(summary_short)

## stats
setnames(stats, c("variable", "n", "min", "max", "mean", "standard deviation"))
stats <- my.flex(stats)
241+
242+
```
243+
244+
# Introduction
245+
246+
This document describes the England and Wales Energy Performance Certificate (EPC) data collected for SERL participants, stored in the file *"SERL_EPC_data_v2020_07.csv"*. The dataset contains `r ncol_epc` columns and `r nrow_epc` rows (one row per participant with available EPC data). This document lists the EPC variables available along with basic information about the values for each variable such as number of unique values and statistics for numerical variables. A guide to the variables is available [here](https://epc.opendatacommunities.org/docs/guidance#glossary_domestic).
247+
248+
A few variables have been added to the EPC data since the data were collected (largely in October 2019, although a few individual households had data retrieved later), and these will be made available in future SERL data releases. Data were collected with the Domestic Energy Performance Certificates API using the house and postcode (details [here](https://epc.opendatacommunities.org/docs/api/domestic)).
249+
250+
The data have not been modified from the original source except for the removal of address data (replaced with our PUPRN used in the other datasets).
251+
252+
# Data summary
253+
254+
Table 1 lists all variables currently available in the SERL EPC dataset. The number of unique values is given, alongside the R data class and an example value from the dataset.
255+
256+
`r tab_caps("epc_fields")`
257+
```{r}
258+
epc_fields
259+
```
260+
261+
For variables with fewer than 10 unique values in the EPC dataset, Table 2 shows the number of records with each value and the percentage of records with this value (or non-value in the case of N/A or 'NO DATA!' etc.). We also include PUPRN to show the number of records.
262+
263+
`r tab_caps("summary_short")`
264+
```{r}
265+
summary_short
266+
```
267+
268+
Table 3 provides basic summary statistics for numeric variables. The column 'n' shows the number of values that were possible to include in the statistics (N/A and similar responses are excluded).
269+
270+
`r tab_caps("stats")`
271+
```{r}
272+
stats
273+
```
274+
275+
276+
277+
278+
279+
280+
281+
282+
283+
284+
285+
286+
287+
288+
31.9 KB
Binary file not shown.
224 KB
Binary file not shown.

0 commit comments

Comments
 (0)