Skip to content

Commit 2f73cfb

Browse files
committed
Accept zip files & update emoji dictionary
1 parent 49d269c commit 2f73cfb

File tree

9 files changed

+5262
-3343
lines changed

9 files changed

+5262
-3343
lines changed

R/download_emoji.R

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222

2323
# Function to scrape an emoji dictionary from https://www.unicode.org/
24-
download_emoji <- function(unicode_page = "https://www.unicode.org/Public/emoji/15.1/emoji-test.txt",
24+
download_emoji <- function(unicode_page = "https://www.unicode.org/Public/draft/emoji/emoji-test.txt",
2525
delete_header = 32,
2626
nlines = -1L) {
2727

@@ -78,7 +78,10 @@ download_emoji <- function(unicode_page = "https://www.unicode.org/Public/emoji/
7878
description <- gsub(" ","_",description)
7979

8080
# Combining into data frame
81-
EmojiDF <- data.frame(R.native = emoji,Desc = description)
81+
EmojiDF <- data.frame(R.native = emoji,
82+
Desc = description,
83+
Bytestring = hex_codepoints,
84+
status = status)
8285

8386
# saving original order
8487
EmojiDF$OriginalOrder <- as.numeric(rownames(EmojiDF))
@@ -89,10 +92,9 @@ download_emoji <- function(unicode_page = "https://www.unicode.org/Public/emoji/
8992
# Combining manually added emoji with the rest
9093
EmojiDF <- rbind.data.frame(EmojiDF,ManAdd[3:4,])
9194

92-
# Matching the keycap exceptions
93-
# TODO: This doesn't work when not using the Full document with nlines!
94-
EmojiDF[c(4648,4649),] <- ManAdd[1:2,]
95-
EmojiDF$OriginalOrder[4648:4649] <- c(4648,4649)
95+
# Adding the WhatsApp special Emoji manually
96+
EmojiDF[EmojiDF$Bytestring == "0023 20E3",] <- ManAdd[1,]
97+
EmojiDF[EmojiDF$Bytestring == "0023 FE0F 20E3",] <- ManAdd[2,]
9698

9799
# ordering from longest to shortest (prevents partial matching of shorter strings further down the line)
98100
EmojiDF <- EmojiDF[rev(order(nchar(as.character(EmojiDF$R.native)))), ]

R/parse_chat.R

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
#' @description Creates a data frame from an exported 'WhatsApp' chat log containing one row per message. Some columns
44
#' are saved as lists using the I() function so that multiple elements can be stored per message while still maintaining
55
#' the general structure of one row per message. These columns should be treated as lists or unlisted first.
6-
#' @param path Character string containing the file path to the exported 'WhatsApp' chat log as a .txt file.
7-
#' @param os Operating system of the phone the chat was exported from. Default "auto" tries to automatically detect the OS. Also supports "android" or "iOS".
6+
#' @param path Character string containing the file path to the exported 'WhatsApp' chat log, either as a .txt file or as a .zip archive containing it.
7+
#' @param os Operating system of the phone the chat was exported from. Default "auto" tries to automatically detect the OS. Also supports "android" or "ios".
88
#' @param language Indicates the language setting of the phone with which the messages were exported. Default is "auto" trying to match either 'English' or 'German'. More languages might be supported in the future.
99
#' @param anonymize TRUE results in the vector of sender names being anonymized and columns containing personally identifiable information being deleted or restricted, FALSE displays the actual names and all content, "add" adds
1010
#' anonymized columns to the full info columns. Do not blindly trust this and always double check.
@@ -42,15 +42,30 @@ parse_chat <- function(path,
4242

4343
# Input checking
4444
if (!file.exists(path)) {stop("'path' must be a valid file path to an exported 'WhatsApp' chatlog in .txt format")}
45-
if (!(os == "auto" | os == "android" | os == "android")) {stop("'os' must either be 'android','ios', or 'auto'")}
45+
if (!(os == "auto" | os == "android" | os == "ios")) {stop("'os' must either be 'android','ios', or 'auto'")}
4646
if (!(language == "auto" | language == "english" | language == "german")) {stop("'language' must be either 'english', 'german', or 'auto'")}
4747
if (!(is.logical(anonymize) | anonymize == "add")) {stop("'anonymize' must be either TRUE, FALSE, or 'add'")}
48-
if (!(is.character(consent) | is.na(consent))) {stop("'consent' must bei either NA or a character vector")}
48+
if (!(is.character(consent) | is.na(consent))) {stop("'consent' must be either NA or a character vector")}
4949
if (!(emoji_dictionary == "internal" | file.exists(emoji_dictionary))) {stop("'emoji_dictionary' must be 'internal' or valid path to a dictionary scraped using download_emoji()")}# TODO
5050
if (!(smilie_dictionary == "emoticons" | smilie_dictionary == "wikipedia")) {stop("'smilie_dictionary' must be 'emoticons' or 'wikipedia'")}
5151
if (!is.character(rpnl)) {stop("'rpnl' must be a character string")}
5252
if (!is.logical(verbose)) {stop("'verbose' must be either TRUE or FALSE")}
5353

54+
# accept .txt or .zip (containing one or more .txt)
55+
if (!file.exists(path)) stop("'path' must be a valid path to a .txt or .zip file with a WhatsApp chat export")
56+
if (grepl("\\.zip$", path, ignore.case = TRUE)) {
57+
z <- utils::unzip(path, list = TRUE)
58+
txt <- z$Name[grepl("\\.txt$", z$Name, ignore.case = TRUE)]
59+
if (!length(txt)) stop("No .txt found inside the .zip export.")
60+
tmpdir <- file.path(tempdir(), "whatsr_zip")
61+
utils::unzip(path, files = txt, exdir = tmpdir, overwrite = TRUE)
62+
# If multiple txts exist, choose the largest by size (usually the chat)
63+
files <- file.path(tmpdir, txt)
64+
sizes <- file.info(files)$size
65+
path <- files[which.max(sizes)]
66+
if (verbose) cat(sprintf("Detected chat log file: %s\n", basename(path)))
67+
}
68+
5469
# Importing raw chat file
5570
RawChat <- readChar(path, file.info(path)$size)
5671

0 commit comments

Comments
 (0)