Updated globals adding internal function object names, updated read_dyads()

reilly-lab · reilly-lab · commit 6305b7e371c7 · 2025-06-08T18:58:13.000-04:00
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,14 @@
+#' Sample Dyadic Interview Transcript: Marc Maron and Terry Gross Radio Interview 2013
+#'
+#' Raw transcript with text and talker information delineated; there are
+#' multiple lines per talker.
+#'
+#' @format ## "MaronGross_2013"
+#' A data.frame with 546 obs, 2 vars:
+#' \describe{
+#'   \item{text}{text from interview}
+#'   \item{speaker}{speaker identity}
+#' }
+"MaronGross_2013"
+
+
diff --git a/R/globals.R b/R/globals.R
@@ -0,0 +1,7 @@
+#' @importFrom utils globalVariables
+NULL
+# Register column names and internal object names as known globals.
+utils::globalVariables(c(
+  "word", "CleanText", "Participant_ID", "Event_ID", "RawText", "clean_me",
+  "clean_dyads", "Temple_Stops25", "MIT_Stops", "SMART", "RawTextPrepped"
+))
diff --git a/R/read_1file.R b/R/read_1file.R
@@ -0,0 +1,76 @@
+#' read_1file
+#'
+#' Standardizes a dyadic (2 interlocutor) conversation transcript that is already loaded in your R environment.
+#'
+#' @name read_1file
+#' @param my_dat data frame (or object coercible to one) with one speaker column and one text column
+#' @return a dataframe formatted with 'Event_ID', "Participant_ID", "RawText" -- ready for clean_dyads()
+#' @export
+
+read_1file <- function(my_dat) {
+  # Name of the object passed in (not its contents); collapsed because deparse() can return several strings
+  object_name <- paste(deparse(substitute(my_dat)), collapse = "")
+
+  # Convert to data frame if not already
+  if (!is.data.frame(my_dat)) {
+    my_dat <- as.data.frame(my_dat)
+  }
+
+  # Store original column names so the error message can show what was supplied
+  original_cols <- colnames(my_dat)
+
+  # Standardize column names (case-insensitive)
+  colnames(my_dat) <- tolower(colnames(my_dat))
+
+  # Start from the lower-cased names; matched entries are overwritten below
+  standardized_cols <- colnames(my_dat)
+
+  # Participant ID detection and standardization (substring match on the lower-cased names)
+  participant_pattern <- "speaker|speaker_names_raw|participant|interlocutor|patient|person|partner|source|pid|talker"
+  participant_idx <- grepl(participant_pattern, colnames(my_dat))
+  if (sum(participant_idx) > 0) {
+    standardized_cols[participant_idx] <- "Participant_ID"
+  }
+
+  # RawText detection; "talker" is a speaker label, so it is matched above only and not re-labelled here
+  text_pattern <- "text|turn|mytext|utterance|my_text"
+  text_idx <- grepl(text_pattern, colnames(my_dat))
+  if (sum(text_idx) > 0) {
+    standardized_cols[text_idx] <- "RawText"
+  }
+
+  # Apply standardized names
+  colnames(my_dat) <- standardized_cols
+
+  # Check required columns exist
+  required_cols <- c("Participant_ID", "RawText")
+  missing_cols <- setdiff(required_cols, colnames(my_dat))
+
+  if (length(missing_cols) > 0) {
+    stop(paste("Missing required columns:",
+               paste(missing_cols, collapse = ", "),
+               "\nAvailable columns:",
+               paste(original_cols, collapse = ", "),
+               "\nExpected participant columns should match:", participant_pattern,
+               "\nExpected text columns should match:", text_pattern),
+         call. = FALSE)
+  }
+
+  # Add Event_ID using the object's name (same value on every row)
+  my_dat$Event_ID <- object_name
+
+  # Convert ID columns to factors
+  id_cols <- c("Event_ID", "Participant_ID")
+  for (col in id_cols) {
+    if (col %in% colnames(my_dat)) {
+      my_dat[[col]] <- as.factor(my_dat[[col]])
+    }
+  }
+
+  # Reorder columns to put standard ones first; any extra columns keep their relative order
+  standard_cols <- c("Event_ID", "Participant_ID", "RawText")
+  other_cols <- setdiff(colnames(my_dat), standard_cols)
+  my_dat <- my_dat[, c(standard_cols, other_cols)]
+
+  return(my_dat)
+}
diff --git a/R/read_dyads.R b/R/read_dyads.R
@@ -114,6 +114,8 @@ read_dyads <- function(folder_name = "my_transcripts") {
              any(grepl("^PID$", colnames(x_read_csv), ignore.case = T)) == TRUE) &
             (any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE |
              any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE |
+             any(grepl("^talker$", colnames(x_read_csv), ignore.case = T)) == TRUE |
+             any(grepl("^MyText$", colnames(x_read_csv), ignore.case = T)) == TRUE |
              any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) {
 
           #correct the speaker and text names to our conventions
@@ -124,6 +126,7 @@ read_dyads <- function(folder_name = "my_transcripts") {
                                        grepl("person", colnames(x_read_csv), ignore.case = T) |
                                        grepl("partner", colnames(x_read_csv), ignore.case = T) |
                                        grepl("source", colnames(x_read_csv), ignore.case = T) |
+                                       grepl("talker", colnames(x_read_csv), ignore.case = T) |
                                        grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID"
 
           colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) |
@@ -135,7 +138,7 @@ read_dyads <- function(folder_name = "my_transcripts") {
 
         col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText"))
 
-        if (col_check != 2) { #if there are less than three columns
+        if (col_check != 2) { #if there are less than two columns
           stop(paste("Function is unable to process csv transcript ", #error stating missing column
                      as.character(match(x, file_list_csv)), #also states the transcript
                      " correctly. Make sure that each transcript includes a column marking who is producing text in each row.