forked from rdpeng/ExData_Plotting1
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload_data.R
More file actions
65 lines (59 loc) · 1.99 KB
/
load_data.R
File metadata and controls
65 lines (59 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# All assignment tasks rely on the same data set, which takes a long time to
# load, so the filtered data is cached on disk after the first load.
# File-level settings read by the functions below.
zip_file   <- "household_power_consumption.zip"  # downloaded archive
raw_file   <- "household_power_consumption.txt"  # text file (also the entry name looked up in the archive)
cache_file <- "_cachedHPC.Rda"                   # serialized, already-filtered data
# Remote location the archive is downloaded from.
url <- "https://d396qusza40orc.cloudfront.net/exdata%2Fdata%2Fhousehold_power_consumption.zip"
# Fetch the zipped data set unless a copy is already present.
#
# Reads the file-level `zip_file` (destination path) and `url` (source).
# The archive is first written to "<zip_file>.part" and only renamed to
# `zip_file` once the transfer reports success, so a failed or interrupted
# download can never be mistaken for a complete archive on the next run.
#
# Returns (invisibly) the status code from download.file(), or NULL when the
# download was skipped. Stops with an error if the transfer or the final
# rename fails.
download <- function() {
  if (file.exists(zip_file)) {
    message(" * Zip file exists. Skipping download step")
    return(invisible(NULL))
  }

  message(" * Zip file not found.")
  message(" -> downloading from original location")

  partial <- paste0(zip_file, ".part")
  # Clean up the partial file on every exit path; after a successful rename
  # the file no longer exists and unlink() is a no-op.
  on.exit(unlink(partial), add = TRUE)

  # With method = "curl" a failed transfer is reported through a non-zero
  # status (plus a warning), not an error, so the status must be checked.
  status <- download.file(url, destfile = partial, method = "curl")
  if (!identical(as.integer(status), 0L)) {
    stop("Download of ", url, " failed with status ", status, call. = FALSE)
  }
  if (!file.rename(partial, zip_file)) {
    stop("Could not move downloaded archive to ", zip_file, call. = FALSE)
  }
  invisible(status)
}
# Read the raw measurements and return the filtered data frame.
#
# Uses the text file `raw_file` when it exists; otherwise makes sure the zip
# archive is available (download()) and reads the entry through the
# connection returned by unzip(). Fields are separated by ";" and "?" marks a
# missing value. The parsed table is passed through filter() before being
# returned.
read_table <- function() {
  message(" -> Loading raw text file")
  if (file.exists(raw_file)) {
    input <- raw_file
  } else {
    message(" * Raw text file not found.")
    download()
    input <- unzip()
  }
  filter(read.table(input, header = TRUE, sep = ";", na.strings = "?"))
}
# Create a connection to the `raw_file` entry inside the `zip_file` archive.
#
# Nothing is extracted to disk: the returned unz() connection is left for the
# caller to read from. Note that this definition masks utils::unzip() for
# code that looks the name up after it.
unzip <- function() {
  message(" -> Extracting data")
  unz(zip_file, raw_file)
}
# Keep only the observations dated 1/2/2007 and 2/2/2007 and add a parsed
# timestamp column.
#
# raw_data: data frame with `Date` ("d/m/Y") and `Time` ("H:M:S") columns.
# Returns the matching rows of `raw_data` with an extra `DateTime` column
# built by strptime() from "<Date> <Time>".
#
# Rows whose `Date` is NA are dropped: `%in%` never yields NA, whereas the
# previous `==` comparison produced NA indices and therefore phantom rows
# filled with NA.
filter <- function(raw_data) {
  message (" -> Processing data")
  wanted_dates <- c("1/2/2007", "2/2/2007")
  data <- raw_data[raw_data$Date %in% wanted_dates, ]
  data$DateTime <- strptime(paste(data$Date, data$Time), "%d/%m/%Y %H:%M:%S")
  data
}
# Entry point: return the prepared data set, using the on-disk cache when
# one is available.
#
# If `cache_file` exists, its contents are read back with readRDS().
# Otherwise the data is built by read_table() (which locates the text or zip
# source, filters the rows and parses the timestamps) and then stored in
# `cache_file` with saveRDS() for subsequent calls.
load_data <- function() {
  message("Loading data")
  if (file.exists(cache_file)) {
    # Fast path: reuse the previously saved result.
    message(" -> Loading data from cache file")
    prepared <- readRDS(cache_file)
  } else {
    # Slow path: build the data from the raw source, then remember it.
    message (" * Cached version not found")
    prepared <- read_table()
    message(" -> Saving data to cache file")
    saveRDS(prepared, cache_file)
  }
  message("Done data preparation.")
  prepared
}