-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcleaning.R
More file actions
96 lines (89 loc) · 5.03 KB
/
cleaning.R
File metadata and controls
96 lines (89 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
library(dplyr)
library(readr)
library(lubridate)
library(janitor)
library(stringr)
library(labelled)
working_directory
## unit of measure data frame and expression words
uom_vars_df <- recode_file[["uom"]] %>%
janitor::clean_names() %>%
dplyr::distinct(uom_criteria, .keep_all = TRUE) %>%
dplyr::mutate(uom_criteria = gsub("S/KG"," /KG", uom_criteria, fixed = TRUE), #put a space before "/KG"
uom_criteria = gsub("S/PC"," /PC", uom_criteria, fixed = TRUE), #put a space before "/PC"
uom_regex = paste0("\\b", uom_criteria, "\\b"),
uom_regex = gsub(".","\\.", uom_regex, fixed = TRUE),
uom_regex = gsub("*","\\*", uom_regex, fixed = TRUE),
uom_regex = gsub("(", "\\(", uom_regex, fixed = TRUE)
)
uom_words <- regex(paste(c(uom_vars_df$uom_regex), sep = "", collapse = '|'),
ignore_case = TRUE)
## Clean food composition df
nutrient_composition_df <- (nutrient_composition_df %>%
janitor::clean_names() %>%
dplyr::mutate(across(where(is.character) & !c(nutrient_classification_document, food_group,
code, food_name, edible_conversion_factor),
~readr::parse_number(.x)
)
, across(!c(nutrient_classification_document, food_group,
code, food_name, edible_conversion_factor),
~if_else(.x == 0, NA, .x)
)
)
)
## unique items data frame joined with unit of measure data frame
unique_items_vars_df <- recode_file[["unique_items"]] %>%
janitor::clean_names() %>%
dplyr::distinct(description, .keep_all = TRUE) %>%
dplyr::left_join(nutrient_composition_df %>%
dplyr::select(-c(food_group, food_name, edible_conversion_factor)
),
by = c("nutrient_classification_document", "code")
) %>%
dplyr::select(-c(nutrient_classification_document, code)
) %>%
dplyr::mutate(uom_criteria = stringr::str_extract(description, uom_words)
) %>%
dplyr::left_join(uom_vars_df %>%
dplyr::select(-c(uom_regex, standard_uom_criteria)),
by = c("uom_criteria")
)
df_clean_a <- df_clean_supermarket_a %>%
dplyr::mutate(price = if_else(price == 0, NA, price) #Replacing price 0 with NA
, sdatetime = as.Date(sdatetime) #format doesn't store any time information
, month_date = lubridate::floor_date(sdatetime, unit = "month")
) %>%
dplyr::group_by(description, month_date) %>%
dplyr::mutate(price = if_else(is.na(price), round(mean(price, na.rm= TRUE),2),
price) #Replacing where price is NA with mean for items
) %>%
dplyr::ungroup() %>%
dplyr::mutate( total_new = price*quantity
, quantity_new = if_else((quantity %% 1) > 0, 1, quantity) #modulo operator with decimal numbers
, price_new = total_new/quantity_new
, customer_type = if_else(!is.na(id) | paymentmode == "Loyalty Redemption", "Loyalty card", "No-Loyalty card")
, month_name = lubridate::month(sdatetime, label = TRUE, abbr = FALSE)
, year = lubridate::year(sdatetime)
, quarter_date = as.factor(lubridate::quarter(sdatetime, type = "year.quarter"))
, age = round(lubridate::time_length(difftime(sdatetime, dob_new, units = "auto"),
unit = "year"),0 #calculating age
)
, age = if_else(age <18, NA, age) #convert less than 18 to missing
, age_group = if_else(age < 26, "18-25",
if_else(age < 36, "26-35",
if_else(age < 50, "36-49", "50 and above" )) #group age
)
) %>%
dplyr::left_join(unique_items_vars_df
, by = c("description")
) %>%
dplyr::mutate(quantity_uom = if_else(is.na(conversion), quantity
, quantity*conversion
)
, price_uom = total_new/quantity_uom
, across(c(price_uom), ~replace(.x, is.nan(.x), 0))
, class_name_uom = paste0(class_name, " (", standard_uom, ")")
) %>%
dplyr::filter(quantity > 0, item_type == "Food Item") %>%
labelled::set_variable_labels(!!!new_labels[names(new_labels) %in% names(.)] #labeling variables from named vector
)