|
1 | 1 | #' compute_auc |
2 | 2 | #' |
3 | 3 | #' internal function that computes two indices of global alignment (auc) between conversation partners for each dyad |
4 | | -#' @name compute_spearman |
| 4 | +#' @name compute_auc |
5 | 5 | #' @importFrom dplyr bind_rows |
| 6 | +#' @importFrom dplyr bind_cols |
| 7 | +#' @importFrom dplyr left_join |
6 | 8 | #' @importFrom dplyr mutate |
| 9 | +#' @importFrom dplyr lag |
7 | 10 | #' @importFrom dplyr select |
| 11 | +#' @importFrom dplyr group_by |
| 12 | +#' @importFrom dplyr ungroup |
| 13 | +#' @importFrom dplyr summarize |
| 14 | +#' @importFrom dplyr first |
| 15 | +#' @importFrom dplyr rename_with |
| 16 | +#' @importFrom dplyr across |
| 17 | +#' @importFrom dplyr everything |
| 18 | +#' @importFrom tidyselect contains |
| 19 | +#' @importFrom tidyr pivot_wider |
| 20 | +#' @importFrom tidyr fill |
8 | 21 | #' @importFrom magrittr %>% |
| 22 | +#' @importFrom zoo na.approx |
| 23 | +#' @importFrom DescTools AUC |
9 | 24 | #' @returns |
10 | 25 | #' nothing - internal function used for intermediary computation piped into summarize_dyads function |
11 | 26 | #' @keywords internal |
12 | 27 | #' @noRd |
13 | 28 |
|
14 | 29 | compute_auc <- function(df_prep, verbose = TRUE) { |
| 30 | + # additional_lags can be added as a parameter, and will accept a vector of ints |
| 31 | + additional_lags = c(1) |
| 32 | + # define the internal functions here: |
| 33 | + # function to calculate AUC for single time series, return NA if there is an error |
| 34 | + calculate_auc <- function(domain_ts, doc_name, dimension, verbose) { |
| 35 | + tryCatch({ |
| 36 | + # if time series has fewer points than the threshold, fill with NA |
| 37 | + if (max(domain_ts$Exchange_Count) < 3) { # hard coded to three exchanges |
| 38 | + #create a single row, single column dataframe with one empty value to fill in the AUC |
| 39 | + doc_domain_auc_df <- data.frame(domain_auc = as.double(NA), |
| 40 | + Exchanges = max(domain_ts$Exchange_Count)) |
| 41 | + } |
| 42 | + else { |
| 43 | + domain_ts <- data.frame(domain_ts) #make single emotion Time series a data frame |
| 44 | + # na.rm is enabled, which will drop any NA values, so any differences where one of the original scores=NA |
| 45 | + domain_auc <- DescTools::AUC(x = domain_ts[,1], y = domain_ts[,2], |
| 46 | + method = "trapezoid", na.rm = T) |
| 47 | + doc_domain_auc_df <- data.frame(domain_auc, |
| 48 | + Exchanges = max(domain_ts$Exchange_Count)) #make data frame of AUC, replicated once |
| 49 | + } |
| 50 | + doc_domain_auc_df |
| 51 | + }, |
| 52 | + error = function(e) { |
| 53 | + if (verbose) { |
| 54 | + message(paste("Results for dAUC will be filled with NA.\n\tTranscript:", |
| 55 | + doc_name, "\n\tDimension:", dimension)) |
| 56 | + } |
| 57 | + # fill the result cell with NA |
| 58 | + doc_domain_auc_df <- data.frame(domain_auc = as.double(NA), |
| 59 | + Exchanges = max(domain_ts$Exchange_Count)) |
| 60 | + doc_domain_auc_df |
| 61 | + }) |
| 62 | + } # end of ccalculate_auc() |
| 63 | + |
15 | 64 | # selects align_var by grepping on possible prefixes of dimensions |
16 | 65 | align_var <- grep("^(emo_|lex_|sem_|phon_)", colnames(df_prep), value = TRUE, ignore.case = TRUE) |
17 | 66 |
|
18 | 67 | # split the data frame into a list by event id |
19 | 68 | df_list <- split(df_prep, f = df_prep$Event_ID) |
20 | 69 |
|
| 70 | + # create a list of the first speaker in each dyad (S1) - for appending at end |
| 71 | + s1_col <- df_prep %>% |
| 72 | + dplyr::group_by(Event_ID) %>% |
| 73 | + dplyr::summarize(S1 = unique(Participant_ID)[1]) |
| 74 | + |
21 | 75 | # iterate over each event, replacing participant names with a transient filler variable |
22 | 76 | df_list_speakvar <- lapply(df_list, function(df){ |
23 | 77 | svec <- unique(as.character(df$Participant_ID)) |
@@ -76,101 +130,137 @@ compute_auc <- function(df_prep, verbose = TRUE) { |
76 | 130 | # join the two participant data frame together by dyad and Exchange_Count |
77 | 131 | widedf <- dplyr::left_join(split_pid_df_list[[1]], split_pid_df_list[[2]], by = c("Event_ID", "Participant_Pair", "Exchange_Count")) |
78 | 132 |
|
79 | | - #iterate over each aligned dimension, selecting only the scores for that dimension and pulling a difference value and subbing it in for the actual values |
80 | | - for (dimension in align_var){ |
81 | | - both_participant_cols <- widedf %>% |
82 | | - dplyr::select(starts_with(dimension)) |
83 | | - # add a count to the data frame (x-axis) |
84 | | - |
85 | | - absdiffcol <- data.frame(dimension = abs(both_participant_cols[,1] - both_participant_cols[,2])) |
86 | | - widedf[which(colnames(widedf) %in% paste(dimension, c("S1", "S2"), sep = "_"))] <- absdiffcol |
| 133 | + # create a list of wide data frames |
| 134 | + wide_df_list <- list(widedf) |
| 135 | + # get the length of the wide df list before adding lags - for naming columns |
| 136 | + start_wide_df_length <- length(wide_df_list) |
| 137 | + # if given, lag time series in widedf and add to list |
| 138 | + if (length(additional_lags > 0)) { |
| 139 | + # iteration for additional |
| 140 | + for (lag in additional_lags){ |
| 141 | + lagdf <- widedf |
| 142 | + val <- abs(lag) |
| 143 | + # if positive lag all S1, if negative, lag all S2 |
| 144 | + if (lag > 0) { |
| 145 | + lagdf <- widedf %>% |
| 146 | + dplyr::group_by(Event_ID) %>% |
| 147 | + dplyr::mutate(dplyr::across(tidyselect::contains("S1"), ~dplyr::lag(., n = val))) %>% |
| 148 | + dplyr::ungroup() |
| 149 | + } |
| 150 | + else if (lag < 0) { |
| 151 | + lagdf <- widedf %>% |
| 152 | + dplyr::group_by(Event_ID) %>% |
| 153 | + dplyr::mutate(dplyr::across(tidyselect::contains("S2"), ~dplyr::lag(., n = val))) %>% |
| 154 | + dplyr::ungroup() |
| 155 | + } |
| 156 | + wide_df_list[[length(wide_df_list)+1]] <- lagdf |
| 157 | + } |
87 | 158 | } |
88 | 159 |
|
89 | | - long_diff_df <- widedf %>% |
90 | | - tidyr::pivot_longer(cols = c(ends_with("_S1") | ends_with("_S2")), |
91 | | - names_to = c("dimension", "Participant_ID"), |
92 | | - names_pattern = "(.*)_([^_]+)$", |
93 | | - values_to = "score") %>% # pivot longer by dimension and pid |
94 | | - # pivot each dimension to a column |
95 | | - tidyr::pivot_wider(names_from = dimension, values_from = score) %>% |
96 | | - dplyr::filter(Participant_ID == "S1") %>% # remove every second row per turn |
97 | | - dplyr::select(-Participant_ID) # remove pid column |
98 | | - # split the difference data frame into a list based on event id |
99 | | - long_diff_df_list <- split(long_diff_df, f = long_diff_df$Event_ID) |
100 | | - |
101 | | - # grab the aligned dimensions as a vector to iterate over |
102 | | - xdimensions <- colnames(long_diff_df)[which(colnames(long_diff_df) %in% align_var)] |
103 | | - |
104 | | - domain_auc_list <- lapply(xdimensions, function(dimension){ #iterate over emotion |
105 | | - # now iterate over each dyad in the corpus |
106 | | - |
107 | | - # function to calculate AUC for single time series, return NA if there is an error |
108 | | - calculate_auc <- function(domain_ts, doc_name, dimension) { |
109 | | - tryCatch({ |
110 | | - # if time series has fewer points than the threshold, fill with NA |
111 | | - if (max(domain_ts$Exchange_Count) < 3) { # hard coded to three exchanges |
112 | | - #create a single row, single column dataframe with one empty value to fill in the AUC |
113 | | - doc_domain_auc_df <- data.frame(domain_auc = as.double(NA), |
114 | | - Exchanges = max(domain_ts$Exchange_Count)) |
115 | | - } |
116 | | - else { |
117 | | - domain_ts <- data.frame(domain_ts) #make single emotion Time series a data frame |
118 | | - domain_auc <- DescTools::AUC(x = domain_ts[,1], y = domain_ts[,2], method = "trapezoid") |
119 | | - doc_domain_auc_df <- data.frame(domain_auc, |
120 | | - Exchanges = max(domain_ts$Exchange_Count)) #make data frame of AUC, replicated once |
121 | | - } |
122 | | - doc_domain_auc_df |
123 | | - }, |
124 | | - error = function(e) { |
125 | | - if (verbose) { |
126 | | - message(paste("Results for dAUC will be filled with NA.\n\tTranscript:", |
127 | | - doc_name, "\n\tDimension:", dimension)) |
128 | | - } |
129 | | - # fill the result cell with NA |
130 | | - doc_domain_auc_df <- data.frame(domain_auc = as.double(NA), |
131 | | - Exchanges = max(domain_ts$Exchange_Count)) |
132 | | - doc_domain_auc_df |
133 | | - }) |
| 160 | + # iterate over the full computation of dAUC for each lag |
| 161 | + list_of_each_dauc_df <- lapply(seq_along(wide_df_list), function(i) { |
| 162 | + widedf <- wide_df_list[[i]] |
| 163 | + |
| 164 | + #iterate over each aligned dimension, selecting only the scores for that dimension and pulling a difference value and subbing it in for the actual values |
| 165 | + for (dimension in align_var){ |
| 166 | + both_participant_cols <- widedf %>% |
| 167 | + dplyr::select(starts_with(dimension)) |
| 168 | + absdiffcol <- data.frame(dimension = abs(both_participant_cols[,1] - both_participant_cols[,2])) |
| 169 | + widedf[which(colnames(widedf) %in% paste(dimension, c("S1", "S2"), sep = "_"))] <- absdiffcol |
134 | 170 | } |
135 | 171 |
|
136 | | - # iterate over each document |
137 | | - single_doc_auc <- lapply(long_diff_df_list, function(df_prep){ |
| 172 | + long_diff_df <- widedf %>% |
| 173 | + tidyr::pivot_longer(cols = c(ends_with("_S1") | ends_with("_S2")), |
| 174 | + names_to = c("dimension", "Participant_ID"), |
| 175 | + names_pattern = "(.*)_([^_]+)$", |
| 176 | + values_to = "score") %>% # pivot longer by dimension and pid |
| 177 | + # pivot each dimension to a column |
| 178 | + tidyr::pivot_wider(names_from = dimension, values_from = score) %>% |
| 179 | + dplyr::filter(Participant_ID == "S1") %>% # remove every second row per turn |
| 180 | + dplyr::select(-Participant_ID) # remove pid column |
| 181 | + # split the difference data frame into a list based on event id |
| 182 | + long_diff_df_list <- split(long_diff_df, f = long_diff_df$Event_ID) |
| 183 | + |
| 184 | + # grab the aligned dimensions as a vector to iterate over |
| 185 | + xdimensions <- colnames(long_diff_df)[which(colnames(long_diff_df) %in% align_var)] |
138 | 186 |
|
139 | | - domain_ts <- df_prep %>% |
140 | | - dplyr::select(Exchange_Count, |
141 | | - tidyselect::contains(dimension)) # take dimension and time |
| 187 | + # NOTE: calculate_auc function is defined here |
142 | 188 |
|
143 | | - # put the function in here |
144 | | - single_doc_domain_auc <- calculate_auc(domain_ts, |
145 | | - doc_name = as.character(df_prep$Event_ID)[1], |
146 | | - dimension = dimension) |
| 189 | + domain_auc_list <- lapply(xdimensions, function(dimension){ #iterate over emotion |
| 190 | + # iterate over each document |
| 191 | + single_doc_auc <- lapply(long_diff_df_list, function(df_prep){ |
| 192 | + |
| 193 | + domain_ts <- df_prep %>% |
| 194 | + dplyr::select(Exchange_Count, |
| 195 | + tidyselect::contains(dimension)) # take dimension and time |
| 196 | + |
| 197 | + # put the function in here |
| 198 | + single_doc_domain_auc <- calculate_auc(domain_ts, |
| 199 | + doc_name = as.character(df_prep$Event_ID)[1], |
| 200 | + dimension = dimension, |
| 201 | + verbose = verbose) |
| 202 | + }) |
| 203 | + #bind all docs AUCs for emotion into one column and add column prefix |
| 204 | + all_doc_domain_auc_df <- dplyr::bind_rows(single_doc_auc) |
| 205 | + # drop exchange column for all but first dimension |
| 206 | + if (dimension != xdimensions[1]) { |
| 207 | + all_doc_domain_auc_df <- data.frame(all_doc_domain_auc_df[,1]) |
| 208 | + } |
| 209 | + colnames(all_doc_domain_auc_df)[1] <- paste("AUC", dimension, sep = "_") |
| 210 | + all_doc_domain_auc_df |
147 | 211 | }) |
148 | | - #bind all docs AUCs for emotion into one column and add column prefix |
149 | | - all_doc_domain_auc_df <- dplyr::bind_rows(single_doc_auc) |
150 | | - # drop exchange column for all but first dimension |
151 | | - if (dimension != xdimensions[1]) { |
152 | | - all_doc_domain_auc_df <- data.frame(all_doc_domain_auc_df[,1]) |
| 212 | + |
| 213 | + all_domain_df <- dplyr::bind_cols(domain_auc_list, data.frame(Event_ID = names(long_diff_df_list))) #bind all columns of AUCs into one data frame |
| 214 | + # standardize each AUC to 50 |
| 215 | + all_domain_df_s <- all_domain_df %>% |
| 216 | + dplyr::mutate(dplyr::across(tidyselect::contains("AUC"), ~ (50 * .x) / Exchanges)) %>% |
| 217 | + dplyr::select(-Exchanges) |
| 218 | + |
| 219 | + output_auc <- dplyr::left_join(all_domain_df, all_domain_df_s, |
| 220 | + by = "Event_ID", suffix = c("_raw", "_scaled100")) %>% |
| 221 | + dplyr::select(c(Event_ID, Exchanges, dplyr::everything())) |
| 222 | + |
| 223 | + |
| 224 | + # add lag to columns if there is a lag |
| 225 | + if (i > start_wide_df_length) { |
| 226 | + # get lag from index |
| 227 | + lag_index <- i - start_wide_df_length |
| 228 | + output_auc <- output_auc %>% |
| 229 | + dplyr::rename_with(~ paste0(., "_Lag", |
| 230 | + as.character(additional_lags[lag_index])), |
| 231 | + tidyselect::starts_with("AUC")) |
| 232 | + } |
| 233 | + # if not the first iteration, remove event id and exchanges |
| 234 | + if (i > 1) { |
| 235 | + output_auc <- output_auc %>% |
| 236 | + dplyr::select(-c(Event_ID, Exchanges)) |
153 | 237 | } |
154 | | - colnames(all_doc_domain_auc_df)[1] <- paste("AUC", dimension, sep = "_") |
155 | | - all_doc_domain_auc_df |
| 238 | + output_auc |
| 239 | + }) |
| 240 | + |
| 241 | + # combine the data frames from different lagged time series |
| 242 | + combined_lag_df <- dplyr::bind_cols(list_of_each_dauc_df) %>% |
| 243 | + dplyr::mutate(Talked_First = s1_col$S1, .after = Exchanges) |
| 244 | + # sub out negative lag colnames for lead |
| 245 | + colnames(combined_lag_df) <- gsub("_Lag-", "_Lead", colnames(combined_lag_df)) |
| 246 | + # set the names of zero lag columsn to immediate |
| 247 | + immediate_ind <- grep(".*(raw|scaled100)$", colnames(combined_lag_df), ignore.case = F) |
| 248 | + #print(immediate_ind) |
| 249 | + colnames(combined_lag_df)[immediate_ind] <- sapply(colnames(combined_lag_df)[immediate_ind], function(x){ |
| 250 | + paste0(x, "_Immediate") |
156 | 251 | }) |
157 | 252 |
|
158 | | - all_domain_df <- dplyr::bind_cols(domain_auc_list, data.frame(Event_ID = names(long_diff_df_list))) #bind all columns of AUCs into one data frame |
| 253 | + #colnames(combined_lag_df)[immediate_ind] <- new_names |
| 254 | + |
159 | 255 | # throw warning if any dyads are fewer than 50 exchanges |
160 | | - small_dyads <- all_domain_df[which(all_domain_df$Exchanges < 50), "Event_ID"] |
| 256 | + small_dyads <- combined_lag_df[which(combined_lag_df$Exchanges < 50), "Event_ID"] |
| 257 | + small_dyads <- unique(small_dyads) |
161 | 258 | if (length(small_dyads) > 0) { |
162 | 259 | warning(paste0("Some conversations are shorter than 50 exchanges (100 turns). ", |
163 | 260 | "It is recommended that conversations are longer than 50 exchanges. ", |
164 | 261 | "Affected conversations:\n", |
165 | 262 | paste(small_dyads, collapse = ", "))) |
166 | 263 | } |
167 | | - # standardize each AUC to 50 |
168 | | - all_domain_df_s <- all_domain_df %>% |
169 | | - dplyr::mutate(dplyr::across(tidyselect::contains("AUC"), ~ (50 * .x) / Exchanges)) %>% |
170 | | - dplyr::select(-Exchanges) |
171 | | - |
172 | | - output_auc <- dplyr::left_join(all_domain_df, all_domain_df_s, by = "Event_ID", suffix = c("_raw", "_scaled100")) %>% |
173 | | - dplyr::select(c(Event_ID, Exchanges, dplyr::everything())) |
174 | 264 |
|
175 | | - return(output_auc) |
| 265 | + return(combined_lag_df) |
176 | 266 | } |
0 commit comments