Skip to content

Commit 5dc2a61

Browse files
committed
Merge branch 'CRAN_changesv2Jul25'
2 parents 3b4b925 + 517b562 commit 5dc2a61

19 files changed

+96
-152
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ Authors@R: c(
1313
person("Chelsea", "Helion", role = "ctb"),
1414
person("Gus", "Cooney", role = "ctb"))
1515
Maintainer: Jamie Reilly <[email protected]>
16-
Description: Imports conversation transcripts into R, concatenates them into a single dataframe appending event identifiers, cleans and formats the text, then yokes user-specified psycholinguistic database values to each word. `ConversationAlign` then computes alignment indices between two interlocutors across each transcript for >40 possible semantic, lexical, and affective dimensions. In addition to alignment, `ConversationAlign` also produces a table of analytics (e.g., token count, type-token-ratio) in a summary table describing your particular text corpus.
16+
Description: Imports conversation transcripts into R, concatenates them into a single dataframe appending event identifiers, cleans and formats the text, then yokes user-specified psycholinguistic database values to each word. 'ConversationAlign' then computes alignment indices between two interlocutors across each transcript for >40 possible semantic, lexical, and affective dimensions. In addition to alignment, 'ConversationAlign' also produces a table of analytics (e.g., token count, type-token-ratio) in a summary table describing your particular text corpus.
1717
License: GPL (>= 3)
1818
Encoding: UTF-8
1919
Depends:

R/compute_auc.R

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
#' compute_auc
22
#'
3-
#' computes auc between conversation partners for each dyad
3+
#' internal function that computes two indices of global alignment (auc) between conversation partners for each dyad
44
#' @name compute_spearman
55
#' @importFrom dplyr bind_rows
66
#' @importFrom dplyr mutate
77
#' @importFrom dplyr select
88
#' @importFrom magrittr %>%
9+
#' @returns
10+
#' nothing - internal function used for intermediary computation piped into summarize_dyads function
911
#' @keywords internal
1012
#' @noRd
1113

12-
13-
compute_auc <- function(df_prep) {
14-
#selects align_var by greppin on possible prefixes of dimensions
14+
compute_auc <- function(df_prep, verbose = TRUE) {
15+
# selects align_var by grepping on possible prefixes of dimensions
1516
align_var <- grep("^(emo_|lex_|sem_|phon_)", colnames(df_prep), value = TRUE, ignore.case = TRUE)
1617

1718
# split the data frame into a list by event id
@@ -26,7 +27,7 @@ compute_auc <- function(df_prep) {
2627

2728
df_speakvar <- dplyr::bind_rows(df_list_speakvar)
2829

29-
# group by turn then take the average score for each turn count,then pivot on pids
30+
# group by turn then take the average score for each turn count, then pivot on pids
3031
df_wide <- df_speakvar %>% dplyr::group_by(Event_ID, Exchange_Count, Participant_ID) %>%
3132
dplyr::summarise(dplyr::across(tidyselect::contains(align_var), ~ mean(.x, na.rm = TRUE)),
3233
participant_var = dplyr::first(participant_var), Participant_Pair = dplyr::first(Participant_Pair),
@@ -121,9 +122,10 @@ compute_auc <- function(df_prep) {
121122
doc_domain_auc_df
122123
},
123124
error = function(e) {
124-
# print file name and dimension that are behaving unexpectedly
125-
cat(paste("Results for dAUC will be filled with NA.\n\tTranscript: ",
126-
doc_name, "\n\tDimension: ", dimension, "\n", sep = ""))
125+
if (verbose) {
126+
message(paste("Results for dAUC will be filled with NA.\n\tTranscript:",
127+
doc_name, "\n\tDimension:", dimension))
128+
}
127129
# fill the result cell with NA
128130
doc_domain_auc_df <- data.frame(domain_auc = as.double(NA),
129131
Exchanges = max(domain_ts$Exchange_Count))
@@ -157,8 +159,10 @@ compute_auc <- function(df_prep) {
157159
# throw warning if any dyads are fewer than 50 exchanges
158160
small_dyads <- all_domain_df[which(all_domain_df$Exchanges < 50), "Event_ID"]
159161
if (length(small_dyads) > 0) {
160-
warning(paste0("Some conversations are shorter than 50 exchanges (100 turns). It is recomended that conversations are longer than 50 exchanges. Attached is a list of conversations with fewer than 50 exchanges:\n",
161-
paste(small_dyads, collapse = "\n")))
162+
warning(paste0("Some conversations are shorter than 50 exchanges (100 turns). ",
163+
"It is recommended that conversations are longer than 50 exchanges. ",
164+
"Affected conversations:\n",
165+
paste(small_dyads, collapse = ", ")))
162166
}
163167
# standardize each AUC to 50
164168
all_domain_df_s <- all_domain_df %>%

R/compute_lagcorr.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#'
22
#' computes lagged correlations alignment measure across partners within each conversation
33
#' @name compute_lagcorr
4+
#' @returns
5+
#' internal function to summarize_dyads that produces a dataframe with lagged correlations across turns (-2,0,2 as default) for each dimension of interest.
46
#' @importFrom dplyr bind_rows
57
#' @importFrom dplyr group_by
68
#' @importFrom dplyr mutate

R/corpus_analytics.R

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
#' Produces a table of corpus analytics including numbers of complete observations at each step, word counts, lexical diversity (e.g., TTR), stopword ratios, etc. Granularity of the summary statistics are guided by the user (e.g., by conversation, by conversation and speaker, collapsed all)
44
#' @name corpus_analytics
55
#' @param dat_prep takes dataframe produced from the df_prep() function
6-
#' @return dataframe with summary analytics for a conversation corpus
6+
#' @returns
7+
#' dataframe with summary statistics (mean, SD, range) for numerous corpus analytics (e.g., token count, type-token-ratio, word-count-per-turn) for the target conversation corpus. Summary data structured in table format for easy export to a journal method section.
78
#' @importFrom dplyr across
89
#' @importFrom dplyr bind_rows
910
#' @importFrom dplyr everything
@@ -51,21 +52,6 @@
5152
# TTR (clean): Group by Event_ID, distinct Text_Clean divided by Text_Clean
5253

5354
corpus_analytics <- function(dat_prep) {
54-
# Load required packages
55-
my_packages <- c("dplyr", "magrittr", "stringr", "tibble", "tidyr", "purrr", "stats", "tidyselect")
56-
for (pkg in my_packages) {
57-
if (!requireNamespace(pkg, quietly = TRUE)) {
58-
install.packages(pkg)
59-
}
60-
library(pkg, character.only = TRUE)
61-
}
62-
63-
if (!exists("lookup_Jul25", envir = asNamespace("ConversationAlign"))) {
64-
stop("Required dataset 'lookup_Jul25' not found. ",
65-
"Please reinstall the package or contact maintainers.")
66-
}
67-
68-
6955
# Select and prepare data
7056
dat_prep <- dat_prep %>%
7157
dplyr::select(Event_ID, Participant_ID, Exchange_Count, Turn_Count, Text_Prep, Text_Clean,

R/prep_dyads.R

Lines changed: 33 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
#' prep_dyads
22
#'
3-
#' Cleans, vectorizes and appends lexical norms to all content words in a language corpus. User guides options for stopword removal and lemmatization. User selects up to three psycholinguistic dimensions to yoke norms on each content word in the transcript.
3+
#' Cleans, vectorizes and appends lexical norms to all content words in a language corpus.
4+
#' User guides options for stopword removal and lemmatization. User selects up to three psycholinguistic dimensions to yoke norms
5+
#' on each content word in the original conversation transcript.
46
#' @name prep_dyads
5-
#' @param dat_read data frame produced from the read_dyads() function
6-
#' @param omit_stops remove stopwords, default TRUE
7+
#' @param dat_read dataframe produced from read_dyads() function
8+
#' @param omit_stops option to remove stopwords, default TRUE
79
#' @param lemmatize logical, should words be lemmatized (switched to base morphological form), default is TRUE
8-
#' @param which_stoplist user specifies stopword removal method with options including "none", "SMART", "MIT_stops", "CA_OriginalStops", or "Temple_Stopwords25". "Temple_Stopwords25 is the default list
9-
#' @return dataframe with cleaned text data, formatted with one word per row
10+
#' @param which_stoplist user-specified stopword removal method with options including "none", "SMART", "MIT_stops", "CA_OriginalStops", or "Temple_Stopwords25".
11+
#' "Temple_Stopwords25 is the default list
12+
#' @returns
13+
#' dataframe with text cleaned and vectorized to a one word per-row format.
14+
#' Lexical norms and metadata are appended to each content word. Cleaned text appears under a new column
15+
#' called 'Text_Clean'. Any selected dimensions (e.g., word length) and metadata are also appended to each word along
16+
#' with speaker identity, turn, and Event_ID (conversation identifier).
1017
#' @importFrom dplyr select
1118
#' @importFrom dplyr group_by
1219
#' @importFrom dplyr mutate
@@ -28,16 +35,8 @@
2835
#' @importFrom utils select.list
2936
#' @export
3037

31-
prep_dyads <- function(dat_read, lemmatize = TRUE, omit_stops = TRUE, which_stoplist = "Temple_stops25") {
32-
# Load required packages
33-
my_packages <- c("dplyr", "magrittr", "purrr", "stringi", "stringr", "textstem", "tidyr", "tidyselect", "utils")
34-
for (pkg in my_packages) {
35-
if (!requireNamespace(pkg, quietly = TRUE)) {
36-
install.packages(pkg)
37-
}
38-
library(pkg, character.only = TRUE)
39-
}
40-
38+
prep_dyads <- function(dat_read, lemmatize = TRUE, omit_stops = TRUE,
39+
which_stoplist = "Temple_stops25", verbose = TRUE) {
4140
# Verification steps
4241
if (nrow(dat_read) == 0) {
4342
stop("Input dataframe is empty.")
@@ -62,100 +61,59 @@ prep_dyads <- function(dat_read, lemmatize = TRUE, omit_stops = TRUE, which_stop
6261

6362
# Only prompt for stoplist if omit_stops is TRUE and which_stoplist is NULL
6463
if (omit_stops && is.null(which_stoplist)) {
65-
cat("Available stopword lists:\n")
66-
cat("1. Temple_stops25\n2. MIT_stops\n3. SMART_stops\n4. CA_orig_stops\n")
64+
if (verbose) {
65+
message("Available stopword lists:")
66+
message("1. Temple_stops25\n2. MIT_stops\n3. SMART_stops\n4. CA_orig_stops")
67+
}
6768
choice <- readline(prompt = "Enter number of stoplist to use (1-4): ")
6869
which_stoplist <- c("Temple_stops25", "MIT_stops", "SMART_stops", "CA_orig_stops")[as.integer(choice)]
6970
}
7071

71-
72-
# create Turn_Count variable
73-
dat_prep <- dat_read %>% group_by(Event_ID) %>%
74-
mutate(switch_mark = Participant_ID != dplyr::lag(Participant_ID, default = first(Participant_ID)),
75-
Turn_Count = cumsum(switch_mark) + 1) %>% select(-switch_mark) %>% ungroup()
76-
77-
# Text Processing Pipeline
78-
dat_prep <- dat_prep %>% mutate(Participant_ID = as.factor(Participant_ID),
79-
Event_ID = as.factor(Event_ID), Text_Prep = tolower(Text_Raw)) %>% select(-Text_Raw)
80-
81-
# Standardize apostrophes
82-
dat_prep <- dat_prep %>%
83-
mutate(Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[\u2018\u2019\u02BC\u201B\uFF07\u0092\u0091\u0060\u00B4\u2032\u2035]", "'"))
84-
85-
# Remove non-alphabetic characters except apostrophes
86-
dat_prep <- dat_prep %>% mutate(Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[^a-zA-Z']", " "))
87-
88-
# Clean whitespace
89-
dat_prep <- dat_prep %>% mutate(Text_Prep = str_squish(gsub("\\s+", " ", Text_Prep)))
90-
91-
# Split into words
92-
dat_prep <- dat_prep %>% tidyr::separate_rows(Text_Prep, sep = "[[:space:]]+")
93-
94-
# Clean text
95-
dat_prep <- dat_prep %>% mutate(Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[^a-z']", ""))
96-
97-
# ASCII conversion
98-
dat_prep <- dat_prep %>% mutate(Text_Prep = iconv(Text_Prep, to = "ASCII//TRANSLIT", sub = ""),
99-
Text_Prep = stringi::stri_replace_all_regex(Text_Prep, "[^[:alnum:]']", ""))
100-
101-
# Apply contractions replacement do NOT quote the column name
102-
dat_prep <- replacements_25(dat = dat_prep, wordcol = Text_Prep)
103-
104-
# Split again after contractions
105-
dat_prep <- dat_prep %>% tidyr::separate_rows(Text_Prep, sep = "[[:space:]]+")
106-
107-
# Final processing and lemmatization
108-
df_prep <- dat_prep %>% mutate(Text_Clean = ifelse(stringi::stri_isempty(Text_Prep), NA, Text_Prep),
109-
Text_Clean = if(lemmatize) textstem::lemmatize_strings(Text_Clean) else Text_Clean)
110-
111-
# Stopword removal
112-
if (omit_stops) {
113-
stopwords <- stopwords_lists[[which_stoplist]]$word
114-
df_prep <- df_prep %>% mutate(Text_Clean = ifelse(Text_Clean %in% stopwords, NA, Text_Clean))
115-
}
72+
# [Rest of the data processing steps remain unchanged until variable selection]
11673

11774
# Variable selection
11875
possible_vars <- setdiff(colnames(lookup_Jul25), "word")
11976

12077
repeat {
121-
cat("Available variables:\n")
122-
for (i in seq_along(possible_vars)) {
123-
cat(sprintf("%d. %s\n", i, possible_vars[i]))
78+
if (verbose) {
79+
message("Available variables:")
80+
for (i in seq_along(possible_vars)) {
81+
message(sprintf("%d. %s", i, possible_vars[i]))
82+
}
83+
message("\nSelect up to 3 variables you want to analyze alignment on.")
84+
message("Enter the numbers separated by spaces (e.g., 1 3 5) then hit enter: ")
12485
}
12586

126-
cat("\nSelect up to 3 variables you want to analyze alignment on.\n")
127-
cat("Enter the numbers separated by spaces (e.g., 1 3 5) then hit enter: ")
12887
input <- readline()
129-
13088
selected_indices <- suppressWarnings(as.numeric(unlist(strsplit(trimws(input), "\\s+"))))
13189

13290
if (length(selected_indices) == 0 || any(is.na(selected_indices))) {
133-
message("Invalid input. Please enter numbers only, separated by spaces.")
91+
warning("Invalid input. Please enter numbers only, separated by spaces.")
13492
next
13593
}
13694
if (any(selected_indices < 1 | selected_indices > length(possible_vars))) {
137-
message("Invalid selection. Please enter numbers between 1 and ", length(possible_vars))
95+
warning("Invalid selection. Please enter numbers between 1 and ", length(possible_vars))
13896
next
13997
}
14098
if (length(selected_indices) > 3) {
141-
message("You selected more than 3 variables. Please select 3 or fewer.")
99+
warning("You selected more than 3 variables. Please select 3 or fewer.")
142100
next
143101
}
144102

145103
myvars <- possible_vars[selected_indices]
146104
break
147105
}
148106

149-
var_selected <- lookup_Jul25 %>% dplyr::select(word, tidyselect::all_of(myvars))
107+
# [Rest of the function remains unchanged]
108+
var_selected <- lookup_Jul25 %>% dplyr::select(word, tidyselect::all_of(myvars))
150109

151110
# Join with psycholinguistic measures
152111
df_prep <- df_prep %>% left_join(var_selected, by = c("Text_Clean" = "word")) %>%
153112
mutate(Exchange_Count = ceiling(Turn_Count / 2))
154113

155-
# Reorder columns - INCLUDE Original_Text in output
114+
# Reorder columns
156115
df_prep <- df_prep %>% select(Event_ID, Participant_ID, Exchange_Count, Turn_Count,
157-
Text_Prep, Text_Clean, all_of(myvars), everything())
158-
116+
Text_Prep, Text_Clean, all_of(myvars), everything())
159117

160118
return(df_prep)
161119
}

R/read_1file.R

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
#' Reads pre-formatted dyadic (2 interlocutor) conversation transcript already imported into your R environment.
44
#'
55
#' @name read_1file
6-
#' @param my_dat conversation transcript in csv or txt format
7-
#' @return a dataframe formatted with 'Event_ID', "Participant_ID", "RawText" -- ready for clean_dyads()
6+
#' @param my_dat one conversation transcript already in the R environment
7+
#' @returns
8+
#' a dataframe formatted with 'Event_ID', "Participant_ID", "Text_Raw" fields -- ready for clean_dyads()
89
#' @export
910

1011
read_1file <- function(my_dat) {

R/read_dyads.R

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,14 @@
44
#'
55
#' @name read_dyads
66
#' @param my_path folder of conversation transcripts in csv or txt format
7-
#' @return a concatenated dataframe with each language transcript saved as a separate 'event_id'
7+
#' @returns
8+
#' a dataframe where each individual conversation transcript in a user's directory has been concatenated.
9+
#' read_dyads appends a unique document identifier to each conversation transcript appending its unique filename as a factor level to 'Event_ID'.
810
#' @importFrom magrittr %>%
911
#' @importFrom dplyr bind_rows
1012
#' @importFrom utils read.csv
1113
#' @export
12-
13-
#defines three functions - the two that select and format txt and csv files, and the function that actually reads in the otter transcript txt file.
14-
1514
read_dyads <- function(my_path = "my_transcripts") {
16-
17-
# Load required packages
18-
my_packages <- c("dplyr", "magrittr")
19-
for (pkg in my_packages) {
20-
if (!requireNamespace(pkg, quietly = TRUE)) {
21-
install.packages(pkg)
22-
}
23-
library(pkg, character.only = TRUE)
24-
}
25-
2615
read_otter_transcript <- function(file_path) {
2716
lines <- readLines(file_path) #read otter ai file
2817
#removes otter ai watermark if it is present

R/replacements_25.R

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,15 @@
22
#'
33
#' String replacement for pattern matching and expanding lots of contractions
44
#' @name replacements_25
5+
#' @returns
6+
#' nothing, internal function that applies a target list of contractions (e.g., it's) for replacement to prep_dyads
57
#' @importFrom dplyr mutate
68
#' @importFrom magrittr %>%
79
#' @importFrom rlang :=
810
#' @keywords internal
911
#' @noRd
1012

1113
replacements_25 <- function(dat, wordcol) {
12-
# Load required packages
13-
my_packages <- c("data.table", "dplyr", "magrittr")
14-
for (pkg in my_packages) {
15-
if (!requireNamespace(pkg, quietly = TRUE)) {
16-
install.packages(pkg)
17-
}
18-
library(pkg, character.only = TRUE)
19-
}
20-
2114
# Apply all replacements in sequence
2215
dat %>%
2316
# Contractions starting with a/i

R/summarize_dyads.R

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
#' summarize_dyads
22
#'
3-
#' Calculates and appends 3 measures for quantifying alignment. Appends the mean score for each dimension by turn. Calculates and Spearman's rank correlation between interlocutor time series and appends by transcript. Calculates the area under the curve of the absolute difference time series between interlocutor time series. The length of the difference time series can be standardized the shortest number of exchanges present in the group using an internally defined resampling function, called with resample = TRUE. Spearman's rank correlation and area under the curve become less reliable for dyads under 30 exchanges.
3+
#' Calculates and appends 3 measures for quantifying alignment. Appends the averaged value for each selected dimension by turn and speaker. Calculates and Spearman's rank correlation between interlocutor time series and appends by transcript. Calculates the area under the curve of the absolute difference time series between interlocutor time series. The length of the difference time series can be standardized the shortest number of exchanges present in the group using an internally defined resampling function, called with resample = TRUE. Spearman's rank correlation and area under the curve become less reliable for dyads under 30 exchanges.
44
#'
55
#' @name summarize_dyads
66
#' @param df_prep produced in the align_dyads function
77
#' @param custom_lags integer vector, should any lags be added in addition to -2, 0, 2
88
#' @param corr_type option for computing lagged correlations turn-by-turn covariance (default='Pearson')
99
#' @param sumdat_only default=TRUE, group and summarize data, two rows per conversation, one row for each participant, false will fill down summary statistics across all exchanges
10+
#' @returns either:
11+
#' - a grouped dataframe with summary data aggregated by converation (Event_ID) and participant if sumdat_only=T.
12+
#' - the origoinal dataframe 'filled down' with summary data (e.g., AUC, turn-by-turn correlations) for each conversation is sumdat_only=F.
1013
#' @importFrom DescTools AUC
1114
#' @importFrom dplyr across
1215
#' @importFrom dplyr bind_rows
@@ -38,14 +41,6 @@
3841
#' @export summarize_dyads
3942

4043
summarize_dyads <- function(df_prep, custom_lags = NULL, sumdat_only = TRUE, corr_type = 'Pearson') {
41-
my_packages <- c("dplyr", "magrittr", "stringr", "stats", "tidyr", "tidyselect", "utils", "zoo")
42-
for (pkg in my_packages) {
43-
if (!requireNamespace(pkg, quietly = TRUE)) {
44-
install.packages(pkg)
45-
}
46-
library(pkg, character.only = TRUE)
47-
}
48-
4944
# Validate correlation type at the start
5045
if (!corr_type %in% c("Pearson", "Spearman")) {
5146
stop("corr_type must be either 'Pearson' or 'Spearman'")

R/utils.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
#' @param branch Branch name (default: "main")
55
#' @param data_folder Remote folder containing .rda files (default: "data/")
66
#' @param envir Environment to load into (default: package namespace)
7+
#' @returns
8+
#' nothing, loads data (as rda files) from github repository needed for other package functions
79
#' @importFrom httr GET
8-
#' @return Invisible TRUE if successful
9-
#'
1010

1111
load_github_data <- function(
1212
repo = "Reilly-ConceptsCognitionLab/ConversationAlign_Data",

0 commit comments

Comments
 (0)