PursuitOfDataScience
diff --git a/‎DESCRIPTION‎
Lines changed: 2 additions & 2 deletions b/‎DESCRIPTION‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 4 additions & 3 deletions b/‎NAMESPACE‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎R/data.R‎
Lines changed: 37 additions & 0 deletions b/‎R/data.R‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎R/emoji-summary.R‎
Lines changed: 41 additions & 9 deletions b/‎R/emoji-summary.R‎
Lines changed: 41 additions & 9 deletions
diff --git a/‎R/tidyEmoji.R‎
Lines changed: 7 additions & 3 deletions b/‎R/tidyEmoji.R‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎R/top-n-emojis.R‎
Lines changed: 59 additions & 0 deletions b/‎R/top-n-emojis.R‎
Lines changed: 59 additions & 0 deletions
diff --git a/‎data/category_unicode_crosswalk.rda‎
13.2 KB b/‎data/category_unicode_crosswalk.rda‎
13.2 KB
diff --git a/‎data/emoji_unicode_crosswalk.rda‎
38.3 KB b/‎data/emoji_unicode_crosswalk.rda‎
38.3 KB
diff --git a/‎man/category_unicode_crosswalk.Rd‎
Lines changed: 27 additions & 0 deletions b/‎man/category_unicode_crosswalk.Rd‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎man/emoji_summary.Rd‎
Lines changed: 10 additions & 5 deletions b/‎man/emoji_summary.Rd‎
Lines changed: 10 additions & 5 deletions
@@ -15,6 +15,6 @@ Imports:
     dplyr,
     emoji,
     purrr,
-    rlang,
     stringr,
-    tibble
+    tibble,
+    utils
@@ -1,10 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
 export(emoji_summary)
+export(emoji_tweets)
+export(top_n_emojis)
 import(dplyr)
 import(emoji)
-import(rlang)
+import(purrr)
 import(stringr)
 import(tibble)
-importFrom(dplyr,"%>%")
-importFrom(purrr,"%||%")
+import(utils)
@@ -0,0 +1,37 @@
+#' Emoji name, Unicode, and Emoji category crosswalk
+#'
+#' A data set containing each Emoji name (such as grinning, smile), its
+#' respective Unicode and category. One thing to note here is there are
+#' duplicated Unicodes in the data set, because one Unicode could have multiple
+#' Emoji names.
+#'
+#' @format A data frame with 4536 rows and 3 columns:
+#' \describe{
+#'   \item{emoji_name}{The name of the Emoji.}
+#'   \item{unicode}{The Unicode of the Emoji.}
+#'   \item{emoji_category}{The category the Emoji falls into.}
+#' }
+#' @source The raw data sets (\code{emoji_name} and \code{emojis}) come from the
+#' \code{emoji} package, and they are processed by the author for the specific
+#' needs of \code{tidyEmoji}.
+"emoji_unicode_crosswalk"
+
+
+
+#' Emoji category, Unicode crosswalk
+#'
+#' A data set containing each Emoji category (such as Activities) and the
+#' Unicodes of its Emojis as a single string separated by \code{|}.
+#'
+#' @format A data frame with 10 rows and 2 columns:
+#' \describe{
+#'   \item{category}{Emoji category (10 categories only)}
+#'   \item{unicodes}{The Unicodes of the Emojis belonging to the category, as
+#'   one string.}
+#' }
+#' @source The raw data set \code{emojis} comes from the
+#' \code{emoji} package, and it is processed by the author for the specific
+#' needs of \code{tidyEmoji}.
+"category_unicode_crosswalk"
+
+
@@ -1,16 +1,22 @@
-#' Emoji Summary Tibble
+#' Emoji summary tibble
 #'
-#' @param tweet_tbl A dataframe/tibble containing tweets.
-#' @param tweet_text The column that is the tweet column.
+#' Given a Twitter dataframe/tibble, this function reports how many Tweets
+#' contain Emojis. A Tweet is counted once no matter whether it has one Emoji
+#' or ten. The result is a tibble that summarizes the number of Tweets
+#' containing at least one Emoji and the total number of Tweets present in
+#' the dataframe/tibble.
 #'
-#' @return A summary tibble including # of tweets in total and # of tweets that
+#' @param tweet_tbl A dataframe/tibble containing tweets/text.
+#' @param tweet_text The tweet/text column.
+#'
+#' @return A summary tibble including # of Tweets in total and # of Tweets that
 #' have at least one Emoji.
 #'
 #' @import dplyr
 #' @import emoji
 #' @import stringr
 #' @import tibble
-#' @import rlang
 #' @export
 #'
 
@@ -21,14 +27,40 @@ emoji_summary <- function(tweet_tbl, tweet_text){
 
   num_emoji_tweets <- tweet_tbl %>%
     dplyr::filter(stringr::str_detect({{ tweet_text }},
-                      emoji::emojis %>%
-                        dplyr::filter(!stringr::str_detect(name, "keycap: \\*")) %>%
-                        dplyr::pull(emoji) %>%
-                        paste(., collapse = "|"))) %>%
+                                      emoji::emojis %>%
+                                        dplyr::filter(!stringr::str_detect(name, "keycap: \\*")) %>%
+                                        dplyr::pull(emoji) %>%
+                                        paste(., collapse = "|"))) %>%
     dim() %>%
     .[1]
 
   return(tibble::tibble(emoji_tweets = num_emoji_tweets,
                         total_tweets = num_tweets))
 
 }
+
+
+
+
+
+#' Keep only text/Tweets that contain Emojis
+#'
+#' Drops every row whose text has no Emoji, so that users can concentrate on
+#' the Tweets containing at least one Emoji.
+#'
+#' @inheritParams emoji_summary
+#'
+#' @return A dataframe/tibble containing only text with at least one Emoji
+#' @export
+
+emoji_tweets <- function(tweet_tbl, tweet_text){
+
+  # Emojis listed in the emoji package, minus those whose name matches
+  # "keycap: *".
+  emoji_chars <- emoji::emojis %>%
+    dplyr::filter(!stringr::str_detect(name, "keycap: \\*")) %>%
+    dplyr::pull(emoji)
+
+  # A single alternation pattern that matches any of those Emojis.
+  emoji_regex <- paste(emoji_chars, collapse = "|")
+
+  # `!!` injects the pattern's value, so a column of `tweet_tbl` that happens
+  # to share the local name cannot shadow it.
+  dplyr::filter(tweet_tbl,
+                stringr::str_detect({{ tweet_text }}, !!emoji_regex))
+
+}
@@ -4,9 +4,13 @@
 #'
 #' @docType package
 #' @name tidyEmoji
-#' @importFrom dplyr %>%
-#' @importFrom purrr %||%
+#' @import utils
 NULL
 
 ## quiets concerns of R CMD check re: the .'s that appear in pipelines
-if(getRversion() >= "2.15.1")  utils::globalVariables(c(".", "name"))
+if(getRversion() >= "2.15.1")  utils::globalVariables(c(".",
+                                                        "name",
+                                                        "emoji_name",
+                                                        "unicode",
+                                                        "emoji_category",
+                                                        "emoji_unicode_crosswalk"))
@@ -0,0 +1,59 @@
+#' Getting n most popular Emojis
+#'
+#' When working with Tweets, counting how many times each Emoji appears in the
+#' entire Tweet corpus is useful. \code{top_n_emojis} does exactly that and
+#' shows how Emojis are distributed across the corpus. Every occurrence is
+#' counted: if a Tweet has 10 Emojis, it contributes 10 counts, each credited
+#' to the matching Emoji and its Emoji category. Because one Unicode can be
+#' listed under several Emoji names, the result could contain the same Unicode
+#' more than once. By default only the first row per Unicode is kept; choose
+#' \code{duplicated_unicode = 'yes'} to keep every Emoji name.
+#'
+#' @inheritParams emoji_summary
+#' @param n Top \code{n} Emojis, default is 20.
+#' @param duplicated_unicode \code{"no"} (default) keeps one row per Unicode;
+#' \code{"yes"} keeps every Emoji name sharing a Unicode. Any other value is
+#' an error.
+#' @return A tibble with the top \code{n} Emojis, holding the columns
+#' \code{emoji_name}, \code{unicode}, \code{emoji_category} and the count
+#' \code{n}, sorted by descending count.
+#' @import tibble
+#' @import purrr
+#' @import dplyr
+#' @export
+#'
+
+
+top_n_emojis <- function(tweet_tbl, tweet_text, n = 20, duplicated_unicode = "no"){
+
+  # Fail fast on an unsupported option instead of silently returning NULL.
+  duplicated_unicode <- match.arg(duplicated_unicode, c("no", "yes"))
+
+  emoji_tbl <- emoji_tweets(tweet_tbl, {{ tweet_text }})
+
+  # The crosswalk lists a Unicode once per Emoji name. Count each distinct
+  # Unicode a single time; the join below attaches the count to every name,
+  # which avoids repeated work and a many-to-many join.
+  unicodes <- unique(emoji_unicode_crosswalk$unicode)
+
+  emoji_counts <- purrr::map_int(unicodes,
+                                 .f = count_each_emoji,
+                                 emoji_tbl,
+                                 {{ tweet_text }})
+
+  tbl <- tibble::tibble(unicode = unicodes,
+                        emoji_count = emoji_counts) %>%
+    dplyr::inner_join(emoji_unicode_crosswalk, by = "unicode") %>%
+    dplyr::distinct() %>%
+    dplyr::count(emoji_name, unicode, emoji_category, wt = emoji_count, sort = TRUE)
+
+  # Collapse Emoji names that share a Unicode down to the first row.
+  if (duplicated_unicode == "no") {
+    tbl <- dplyr::distinct(tbl, unicode, .keep_all = TRUE)
+  }
+
+  # `n` here is the function argument, not the count column of `tbl`.
+  utils::head(tbl, n)
+
+}
+
+
+# Count how often one Emoji occurs in a text column (internal helper).
+#
+# unicode:    the Emoji string to look for.
+# df:         dataframe/tibble holding the text.
+# tweet_text: the text column of `df` (unquoted, forwarded with `{{ }}`).
+#
+# Returns the occurrences summed over all rows of `df`.
+count_each_emoji <- function(unicode, df, tweet_text){
+  df %>%
+    dplyr::pull({{ tweet_text }}) %>%
+    # Match the Emoji literally: as a regex, a sequence containing a
+    # metacharacter (e.g. the `*` keycap) would error or miscount.
+    stringr::str_count(stringr::fixed(unicode)) %>%
+    sum()
+}
-Original file line number
+Diff line change
     dplyr,
     emoji,
     purrr,
 -    rlang,
     stringr,
 -    tibble
 +    tibble,
 +    utils