Skip to content

Commit 6305b7e

Browse files
committed
Updated globals adding internal function object names, updated read_dyads()
1 parent aa935d6 commit 6305b7e

File tree

4 files changed

+101
-1
lines changed

4 files changed

+101
-1
lines changed

R/data.R

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#' Sample Dyadic Interview Transcript: Marc Maron and Terry Gross Radio Interview 2013
2+
#'
3+
#' Text and talker information delineated, raw transcript, multiple lines per talker
4+
#'
5+
#' @format ## "MaronGross_2013"
6+
#' A data.frame with 546 obs, 2 vars:
7+
#' \describe{
8+
#' \item{text}{text from interview}
9+
#' \item{speaker}{speaker identity}
10+
#' ...
11+
#' }
12+
"MaronGross_2013"
13+
14+

R/globals.R

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#' @importFrom utils globalVariables
NULL

# Register the names below with utils::globalVariables() so that
# R CMD check does not report them as undefined global variables.
utils::globalVariables(
  c(
    "word",
    "CleanText",
    "Participant_ID",
    "Event_ID",
    "RawText",
    "clean_me",
    "clean_dyads",
    "Temple_Stops25",
    "MIT_Stops",
    "SMART",
    "RawTextPrepped"
  )
)

R/read_1file.R

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#' read_1file
#'
#' Standardizes one pre-formatted dyadic (2 interlocutor) conversation
#' transcript that has already been imported into your R environment.
#'
#' Column names are lower-cased and then matched (as substrings) against a
#' list of common speaker and text column names. The first column matching a
#' text name becomes "RawText"; the first remaining column matching a speaker
#' name becomes "Participant_ID". Any further matching columns keep their
#' lower-cased names and are returned after the standard columns. "Event_ID"
#' is filled with the name of the object that was passed in.
#'
#' @name read_1file
#' @param my_dat a conversation transcript already loaded into R as a data frame (or an object that as.data.frame() can convert), holding one speaker column and one text column
#' @return a dataframe formatted with 'Event_ID', "Participant_ID", "RawText" -- ready for clean_dyads()
#' @export

read_1file <- function(my_dat) {
  # Capture the NAME of the object passed in (not its contents). deparse() can
  # return several strings for a long expression, so collapse to one label.
  object_name <- paste(deparse(substitute(my_dat)), collapse = " ")

  if (!is.data.frame(my_dat)) {
    my_dat <- as.data.frame(my_dat)
  }

  # Keep the caller's column names for the error message, then match on
  # lower-cased names so detection is case-insensitive.
  original_cols <- colnames(my_dat)
  colnames(my_dat) <- tolower(colnames(my_dat))
  standardized_cols <- colnames(my_dat)

  participant_pattern <- "speaker|speaker_names_raw|participant|interlocutor|patient|person|partner|source|pid|talker"
  # "talker" identifies who is speaking, so it is a participant name only;
  # listing it here as well made a talker/text transcript fail.
  text_pattern <- "text|turn|mytext|utterance|my_text"

  # A column matching both patterns counts as text (text is applied last).
  is_text <- grepl(text_pattern, standardized_cols)
  is_participant <- grepl(participant_pattern, standardized_cols) & !is_text

  # Rename only the first match per role. Renaming every match would create
  # duplicate column names, and the later duplicates would be lost when the
  # columns are reordered below.
  participant_hit <- which(is_participant)[1]
  text_hit <- which(is_text)[1]
  if (!is.na(participant_hit)) {
    standardized_cols[participant_hit] <- "Participant_ID"
  }
  if (!is.na(text_hit)) {
    standardized_cols[text_hit] <- "RawText"
  }
  colnames(my_dat) <- standardized_cols

  required_cols <- c("Participant_ID", "RawText")
  missing_cols <- setdiff(required_cols, colnames(my_dat))

  if (length(missing_cols) > 0) {
    stop(paste("Missing required columns:",
               paste(missing_cols, collapse = ", "),
               "\nAvailable columns:",
               paste(original_cols, collapse = ", "),
               "\nExpected participant columns should match:", participant_pattern,
               "\nExpected text columns should match:", text_pattern),
         call. = FALSE)
  }

  # rep() keeps the assignment valid for a transcript with zero rows.
  my_dat$Event_ID <- as.factor(rep(object_name, nrow(my_dat)))
  my_dat[["Participant_ID"]] <- as.factor(my_dat[["Participant_ID"]])

  # Standard columns first, everything else after in its existing order.
  standard_cols <- c("Event_ID", "Participant_ID", "RawText")
  other_cols <- setdiff(colnames(my_dat), standard_cols)
  my_dat <- my_dat[, c(standard_cols, other_cols), drop = FALSE]

  my_dat
}

R/read_dyads.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ read_dyads <- function(folder_name = "my_transcripts") {
114114
any(grepl("^PID$", colnames(x_read_csv), ignore.case = T)) == TRUE) &
115115
(any(grepl("^Text$", colnames(x_read_csv), ignore.case = T)) == TRUE |
116116
any(grepl("^Turn$", colnames(x_read_csv), ignore.case = T)) == TRUE |
117+
any(grepl("^talker$", colnames(x_read_csv), ignore.case = T)) == TRUE |
118+
any(grepl("^MyText$", colnames(x_read_csv), ignore.case = T)) == TRUE |
117119
any(grepl("^Utterance$", colnames(x_read_csv), ignore.case = T)) == TRUE)) {
118120

119121
#correct the speaker and text names to our conventions
@@ -124,6 +126,7 @@ read_dyads <- function(folder_name = "my_transcripts") {
124126
grepl("person", colnames(x_read_csv), ignore.case = T) |
125127
grepl("partner", colnames(x_read_csv), ignore.case = T) |
126128
grepl("source", colnames(x_read_csv), ignore.case = T) |
129+
grepl("talker", colnames(x_read_csv), ignore.case = T) |
127130
grepl("participant", colnames(x_read_csv), ignore.case = T))] <- "Participant_ID"
128131

129132
colnames(x_read_csv)[which(grepl("Text", colnames(x_read_csv), ignore.case = T) |
@@ -135,7 +138,7 @@ read_dyads <- function(folder_name = "my_transcripts") {
135138

136139
col_check <- sum(colnames(x_read_csv) %in% c("Participant_ID", "RawText"))
137140

138-
if (col_check != 2) { #if there are less than three columns
141+
if (col_check != 2) { #if there are less than two columns
139142
stop(paste("Function is unable to process csv transcript ", #error stating missing column
140143
as.character(match(x, file_list_csv)), #also states the transcript
141144
" correctly. Make sure that each transcript includes a column marking who is producing text in each row.

0 commit comments

Comments
 (0)