From bdd0d25e4f1f35d7181a3ec2eaef6afb577c9774 Mon Sep 17 00:00:00 2001 From: mcnakhaee Date: Mon, 12 Oct 2020 21:54:53 +0200 Subject: [PATCH 1/4] update Namespace --- DESCRIPTION | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ec29b6e..6c5eda1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,8 @@ Imports: tibble, tokenizers, utils, - stats + stats, + spelling Suggests: knitr, roxygen2, @@ -35,4 +36,4 @@ Suggests: covr Encoding: UTF-8 LazyData: yes -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 From cd89ed10459960809f596a6ee3203d36755fe534 Mon Sep 17 00:00:00 2001 From: mcnakhaee Date: Mon, 12 Oct 2020 21:55:32 +0200 Subject: [PATCH 2/4] add n_misspelled and n_emojis --- R/count.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/R/count.R b/R/count.R index c7ca522..17b74ca 100644 --- a/R/count.R +++ b/R/count.R @@ -145,6 +145,23 @@ n_lowers <- function(x) { x } +n_misspelled <- function(x){ + na <- is.na(x) + if (all(na)) return(0) + x <- purrr::map_int(x, ~ purrr::pluck(dim(spelling::spell_check_text(.x)),1)) + x[na] <- NA_integer_ + x +} + +n_emojis <- function(x){ + na <- is.na(x) + if (all(na)) return(0) + m <- gregexpr("[^\x01-\x7F]",x) + x <- purrr::map_dbl(m, ~ sum(.x > 0)/2) # tally real matches only, so a no-match sentinel contributes 0 + x[na] <- NA_integer_ + x +} + n_urls <- function(x) { na <- is.na(x) if (all(na)) return(0) @@ -222,6 +239,7 @@ to_be <- function(x) { purrr::map_int(x, ~ sum(fp %in% .x, na.rm = TRUE)) } + prepositions <- function(x) { fp <- c("about", "below", "excepting", "off", "toward", "above", "beneath", "on", "under", "across", "from", "onto", "underneath", "after", "between", @@ -271,6 +289,8 @@ prepositions <- function(x) { #' \item{\code{n_extraspaces}}{Number of times more then 1 consecutive space have been used.} #' \item{\code{n_caps}}{Number of upper case characters.} #' \item{\code{n_lowers}}{Number of lower case characters.} +#' \item{\code{n_misspelled}}{Number of misspelled words.} +#' 
\item{\code{n_emojis}}{Number of emojis.} #' \item{\code{n_urls}}{Number of urls.} #' \item{\code{n_uq_urls}}{Number of unique urls.} #' \item{\code{n_nonasciis}}{Number of non ascii characters.} @@ -301,6 +321,8 @@ count_functions <- list( n_extraspaces = n_extraspaces, n_caps = n_caps, n_lowers = n_lowers, + n_misspelled = n_misspelled, + n_emojis = n_emojis, n_urls = n_urls, n_uq_urls = n_uq_urls, n_nonasciis = n_nonasciis, From 8078a062212de5702be611e5021e2a85d978022e Mon Sep 17 00:00:00 2001 From: mcnakhaee Date: Mon, 12 Oct 2020 21:55:59 +0200 Subject: [PATCH 3/4] add n_misspelled and n_emojis --- R/tweet-features.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/tweet-features.R b/R/tweet-features.R index a627bd0..a731e8d 100644 --- a/R/tweet-features.R +++ b/R/tweet-features.R @@ -30,5 +30,7 @@ tweet_features <- function(text) { o$n_puncts <- n_puncts(text) o$n_capsp <- (o$n_caps + 1L) / (o$n_chars + 1L) o$n_charsperword <- (o$n_chars + 1L) / (o$n_words + 1L) + o$n_misspelled <- n_misspelled(text) + o$n_emojis <- n_emojis(text) o } From 0f5c5cc598735fc05f63580e2e245715e0cb880e Mon Sep 17 00:00:00 2001 From: mcnakhaee Date: Mon, 12 Oct 2020 21:56:17 +0200 Subject: [PATCH 4/4] add n_misspelled and n_emojis --- man/count_functions.Rd | 8 ++++++-- man/textfeatures.Rd | 22 ++++++++++++++-------- man/word_dims.Rd | 8 ++++---- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/man/count_functions.Rd b/man/count_functions.Rd index 5ac3b06..9c895bc 100644 --- a/man/count_functions.Rd +++ b/man/count_functions.Rd @@ -4,7 +4,8 @@ \name{count_functions} \alias{count_functions} \title{List of all feature counting functions} -\format{Named list of all ferature counting functions +\format{ +Named list of all ferature counting functions \describe{ \item{\code{n_words}}{Number of words.} \item{\code{n_uq_words}}{Number of unique words.} @@ -21,6 +22,8 @@ \item{\code{n_extraspaces}}{Number of times more then 1 consecutive space have been used.} 
\item{\code{n_caps}}{Number of upper case characters.} \item{\code{n_lowers}}{Number of lower case characters.} +\item{\code{n_misspelled}}{Number of misspelled words.} +\item{\code{n_emojis}}{Number of emojis.} \item{\code{n_urls}}{Number of urls.} \item{\code{n_uq_urls}}{Number of unique urls.} \item{\code{n_nonasciis}}{Number of non ascii characters.} @@ -33,7 +36,8 @@ \item{\code{third_person}}{Number of "third person" words.} \item{\code{to_be}}{Number of "to be" words.} \item{\code{prepositions}}{Number of preposition words.} -}} +} +} \usage{ count_functions } diff --git a/man/textfeatures.Rd b/man/textfeatures.Rd index 0cf2389..f7ef51b 100644 --- a/man/textfeatures.Rd +++ b/man/textfeatures.Rd @@ -4,8 +4,14 @@ \alias{textfeatures} \title{textfeatures} \usage{ -textfeatures(text, sentiment = TRUE, word_dims = NULL, - normalize = TRUE, newdata = NULL, verbose = TRUE) +textfeatures( + text, + sentiment = TRUE, + word_dims = NULL, + normalize = TRUE, + newdata = NULL, + verbose = TRUE +) } \arguments{ \item{text}{Input data. Should be character vector or data frame with character @@ -42,13 +48,13 @@ Extracts features from text vector. trump_tweets <- c( "#FraudNewsCNN #FNN https://t.co/WYUnHjjUjg", "TODAY WE MAKE AMERICA GREAT AGAIN!", - paste("Why would Kim Jong-un insult me by calling me \\"old,\\" when I would", - "NEVER call him \\"short and fat?\\" Oh well, I try so hard to be his", + paste("Why would Kim Jong-un insult me by calling me \"old,\" when I would", + "NEVER call him \"short and fat?\" Oh well, I try so hard to be his", "friend - and maybe someday that will happen!"), paste("Such a beautiful and important evening! The forgotten man and woman", "will never be forgotten again. 
We will all come together as never before"), - paste("North Korean Leader Kim Jong Un just stated that the \\"Nuclear", - "Button is on his desk at all times.\\" Will someone from his depleted and", + paste("North Korean Leader Kim Jong Un just stated that the \"Nuclear", + "Button is on his desk at all times.\" Will someone from his depleted and", "food starved regime please inform him that I too have a Nuclear Button,", "but it is a much bigger & more powerful one than his, and my Button", "works!") @@ -60,9 +66,9 @@ textfeatures(trump_tweets) ## data frame with a character vector named "text" df <- data.frame( id = c(1, 2, 3), - text = c("this is A!\\t sEntence https://github.com about #rstats @github", + text = c("this is A!\t sEntence https://github.com about #rstats @github", "and another sentence here", - "The following list:\\n- one\\n- two\\n- three\\nOkay!?!"), + "The following list:\n- one\n- two\n- three\nOkay!?!"), stringsAsFactors = FALSE ) diff --git a/man/word_dims.Rd b/man/word_dims.Rd index 1ea1b87..48425cf 100644 --- a/man/word_dims.Rd +++ b/man/word_dims.Rd @@ -29,13 +29,13 @@ Calculates word2vec dimension estimates trump_tweets <- c( "#FraudNewsCNN #FNN https://t.co/WYUnHjjUjg", "TODAY WE MAKE AMERICA GREAT AGAIN!", -paste("Why would Kim Jong-un insult me by calling me \\"old,\\" when I would", - "NEVER call him \\"short and fat?\\" Oh well, I try so hard to be his", +paste("Why would Kim Jong-un insult me by calling me \"old,\" when I would", + "NEVER call him \"short and fat?\" Oh well, I try so hard to be his", "friend - and maybe someday that will happen!"), paste("Such a beautiful and important evening! The forgotten man and woman", "will never be forgotten again. 
We will all come together as never before"), -paste("North Korean Leader Kim Jong Un just stated that the \\"Nuclear", - "Button is on his desk at all times.\\" Will someone from his depleted and", +paste("North Korean Leader Kim Jong Un just stated that the \"Nuclear", + "Button is on his desk at all times.\" Will someone from his depleted and", "food starved regime please inform him that I too have a Nuclear Button,", "but it is a much bigger & more powerful one than his, and my Button", "works!")