Skip to content

Commit 9722505

Browse files
author
Youzhi Yu
committed
added data and function
1 parent eee0784 commit 9722505

13 files changed

+269
-22
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,6 @@ Imports:
1515
dplyr,
1616
emoji,
1717
purrr,
18-
rlang,
1918
stringr,
20-
tibble
19+
tibble,
20+
utils

NAMESPACE

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export(emoji_summary)
4+
export(emoji_tweets)
5+
export(top_n_emojis)
46
import(dplyr)
57
import(emoji)
6-
import(rlang)
8+
import(purrr)
79
import(stringr)
810
import(tibble)
9-
importFrom(dplyr,"%>%")
10-
importFrom(purrr,"%||%")
11+
import(utils)

R/data.R

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#' Emoji name, Unicode, and Emoji category crosswalk
2+
#'
3+
#' A data set containing each Emoji name (such as grinning, smile), its
4+
#' respective Unicode and category. One thing to note here is there are
5+
#' duplicated Unicodes in the data set, because one Unicode could have multiple
6+
#' Emoji names.
7+
#'
8+
#' @format A data frame with 4536 rows and 3 columns:
9+
#' \describe{
10+
#' \item{emoji_name}{The name of Emoji per se.}
11+
#' \item{unicode}{The Unicode of Emoji.}
12+
#' \item{emoji_category}{The category Emoji falls into.}
13+
#' }
14+
#' @source The raw data sets (\code{emoji_name} and \code{emojis}) come from the
15+
#' \code{emoji} package, and they are processed by the author for the specific
16+
#' needs of \code{tidyEmoji}.
17+
"emoji_unicode_crosswalk"
18+
19+
20+
21+
#' Emoji category, Unicode crosswalk
22+
#'
23+
#' A data set containing each Emoji category (such as Activities), its
24+
#' respective string of Unicodes, separated by \code{|}.
25+
#'
26+
#' @format A data frame with 10 rows and 2 columns:
27+
#' \describe{
28+
#' \item{category}{Emoji category (10 categories only)}
29+
#' \item{unicodes}{The Unicodes string of Emojis belonging to category per
30+
#' se.}
31+
#' }
32+
#' @source The raw data set \code{emojis} comes from the
33+
#' \code{emoji} package, and it is processed by the author for the specific
34+
#' needs of \code{tidyEmoji}.
35+
"category_unicode_crosswalk"
36+
37+

R/emoji-summary.R

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,22 @@
1-
#' Emoji Summary Tibble
1+
#' Emoji summary tibble
22
#'
3-
#' @param tweet_tbl A dataframe/tibble containing tweets.
4-
#' @param tweet_text The column that is the tweet column.
3+
#' When you have a Twitter dataframe/tibble at hand, it is useful to know
4+
#' how many Tweets contain Emojis. This is the right time to use this function.
5+
#' What is worth noting is that it does not matter whether a Tweet has one Emoji
6+
#' or ten Emojis, the function only counts it once and returns a tibble that
7+
#' summarizes the number of Tweets containing at least one Emoji and the total
8+
#' number of Tweets presented in the dataframe/tibble.
59
#'
6-
#' @return A summary tibble including # of tweets in total and # of tweets that
10+
#' @param tweet_tbl A dataframe/tibble containing tweets/text.
11+
#' @param tweet_text The tweet/text column.
12+
#'
13+
#' @return A summary tibble including # of Tweets in total and # of Tweets that
714
#' have at least one Emoji.
815
#'
916
#' @import dplyr
1017
#' @import emoji
1118
#' @import stringr
1219
#' @import tibble
13-
#' @import rlang
1420
#' @export
1521
#'
1622

@@ -21,14 +27,40 @@ emoji_summary <- function(tweet_tbl, tweet_text){
2127

2228
num_emoji_tweets <- tweet_tbl %>%
2329
dplyr::filter(stringr::str_detect({{ tweet_text }},
24-
emoji::emojis %>%
25-
dplyr::filter(!stringr::str_detect(name, "keycap: \\*")) %>%
26-
dplyr::pull(emoji) %>%
27-
paste(., collapse = "|"))) %>%
30+
emoji::emojis %>%
31+
dplyr::filter(!stringr::str_detect(name, "keycap: \\*")) %>%
32+
dplyr::pull(emoji) %>%
33+
paste(., collapse = "|"))) %>%
2834
dim() %>%
2935
.[1]
3036

3137
return(tibble::tibble(emoji_tweets = num_emoji_tweets,
3238
total_tweets = num_tweets))
3339

3440
}
41+
42+
43+
44+
45+
46+
#' Emoji Text/Tweets Output
47+
#'
48+
#' When users just want to focus on Tweets containing Emoji(s),
49+
#' \code{emoji_tweets} filters out non-Emoji rows and only returns rows that
50+
#' have at least one Emoji.
51+
#'
52+
#' @inheritParams emoji_summary
53+
#'
54+
#' @return A dataframe/tibble containing only text with at least one Emoji
55+
#' @export
56+
57+
emoji_tweets <- function(tweet_tbl, tweet_text) {
  # Every emoji known to the `emoji` package except the "keycap: *" entry,
  # which is excluded before the alternation pattern is assembled.
  is_star_keycap <- stringr::str_detect(emoji::emojis$name, "keycap: \\*")
  kept_emojis <- emoji::emojis$emoji[which(!is_star_keycap)]

  # One regex alternation matching any of the remaining emojis.
  emoji_pattern <- paste(kept_emojis, collapse = "|")

  # Keep only the rows whose text column contains at least one emoji.
  # `!!` injects the pattern so a same-named column in `tweet_tbl` cannot
  # shadow it inside the data mask.
  dplyr::filter(
    tweet_tbl,
    stringr::str_detect({{ tweet_text }}, !!emoji_pattern)
  )
}

R/tidyEmoji.R

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@
44
#'
55
#' @docType package
66
#' @name tidyEmoji
7-
#' @importFrom dplyr %>%
8-
#' @importFrom purrr %||%
7+
#' @import utils
98
NULL
109

1110
## quiets concerns of R CMD check re: the .'s that appear in pipelines
12-
if(getRversion() >= "2.15.1") utils::globalVariables(c(".", "name"))
11+
# Besides ".", this registers the bare column names used inside the package's
# dplyr calls (`name`, `emoji_name`, `unicode`, `emoji_category`) and the
# packaged data set `emoji_unicode_crosswalk` as known global variables.
if(getRversion() >= "2.15.1") utils::globalVariables(c(".",
12+
"name",
13+
"emoji_name",
14+
"unicode",
15+
"emoji_category",
16+
"emoji_unicode_crosswalk"))

R/top-n-emojis.R

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#' Getting n most popular Emojis
2+
#'
3+
#' When working with Tweets, counting how many times each Emoji appears in the
4+
#' entire Tweet corpus is useful. This is when \code{top_n_emojis} comes into
5+
#' play, and it is handy to see how Emojis are distributed across the corpus.
6+
#' If a Tweet has 10 Emojis, \code{top_n_emojis} will count it 10 times and
7+
#' assign each of the 10 Emojis to its respective Emoji category. What is
8+
#' interesting to note is Unicodes returned by \code{top_n_emojis} could have
9+
#' duplicates, meaning some Unicodes share various Emoji names. By default, this
10+
#' does not happen, but users can choose \code{duplicated_unicode = 'yes'} to
11+
#' obtain duplicated Unicodes.
12+
#'
13+
#' @inheritParams emoji_summary
14+
#' @param n Top \code{n} Emojis, default is 20.
15+
#' @param duplicated_unicode Use \code{no} to drop repeated Unicodes. Otherwise,
16+
#' \code{yes}. Default is \code{no}.
17+
#' @return A tibble with top \code{n} Emojis
18+
#' @import tibble
19+
#' @import purrr
20+
#' @import dplyr
21+
#' @export
22+
#'
23+
24+
25+
top_n_emojis <- function(tweet_tbl, tweet_text, n = 20, duplicated_unicode = "no") {
  # Counts how often each emoji in `emoji_unicode_crosswalk` occurs in the
  # `tweet_text` column of `tweet_tbl` and returns the `n` most frequent ones
  # as a tibble with columns `emoji_name`, `unicode`, `emoji_category`, `n`.
  # `duplicated_unicode = "no"` keeps one row per Unicode; "yes" keeps every
  # emoji name that shares a Unicode.

  # Fail fast on an unsupported option instead of silently returning NULL.
  duplicated_unicode <- match.arg(duplicated_unicode, c("no", "yes"))

  # Only rows that contain at least one emoji can contribute to the counts.
  emoji_tbl <- emoji_tweets(tweet_tbl, {{ tweet_text }})

  # The crosswalk lists some Unicodes under several names. Counting each
  # distinct Unicode once avoids repeated scans of the corpus and keeps the
  # join below from fanning out into a many-to-many match.
  unicodes <- unique(emoji_unicode_crosswalk$unicode)

  # One total per Unicode; map_int() guarantees exactly one integer each.
  emoji_counts <- purrr::map_int(unicodes,
                                 .f = count_each_emoji,
                                 emoji_tbl,
                                 {{ tweet_text }})

  ranked <- tibble::tibble(unicode = unicodes,
                           emoji_count = emoji_counts) %>%
    dplyr::inner_join(emoji_unicode_crosswalk, by = "unicode") %>%
    dplyr::distinct() %>%
    dplyr::count(emoji_name, unicode, emoji_category,
                 wt = emoji_count, sort = TRUE)

  # When duplicates are not wanted, keep only the first (highest-ranked) name
  # recorded for each Unicode.
  if (duplicated_unicode == "no") {
    ranked <- dplyr::distinct(ranked, unicode, .keep_all = TRUE)
  }

  utils::head(ranked, n)
}
52+
53+
54+
count_each_emoji <- function(unicode, df, tweet_text) {
  # Extract the text column as a plain vector.
  text_column <- dplyr::pull(df, {{ tweet_text }})

  # Occurrences of `unicode` in each element, then the total over all of them.
  per_text_counts <- stringr::str_count(text_column, unicode)
  sum(per_text_counts)
}
13.2 KB
Binary file not shown.

data/emoji_unicode_crosswalk.rda

38.3 KB
Binary file not shown.

man/category_unicode_crosswalk.Rd

Lines changed: 27 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/emoji_summary.Rd

Lines changed: 10 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)