
Commit 288027d

Rename score -> classify

Improved the Naive Bayes classifier as well.

1 parent d81e60e commit 288027d

File tree

2 files changed: +175 -118 lines


R/classify.R

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
# simple aggregation function with regex-driven gsub
classify.aggregate = function(sentences, pos.words, neg.words, .progress='none')
{
  require(plyr)
  require(stringr)

  # laply returns an array ("a") of scores: "l" + "a" + "ply" = "laply"
  scores = laply(sentences, function(sentence, pos.words, neg.words) {

    # strip punctuation, control characters, digits, retweet/via markers,
    # @mentions, URLs, and leading/trailing or repeated whitespace
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub("[[:digit:]]", "", sentence)
    sentence = gsub('\\d+', '', sentence)
    sentence = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", sentence)
    sentence = gsub("@\\w+", "", sentence)
    sentence = gsub("http\\w+", "", sentence)
    sentence = gsub("[ \t]{2,}", " ", sentence)
    sentence = gsub("^\\s+|\\s+$", "", sentence)

    # tolower() can fail on malformed multibyte text, so wrap it in tryCatch
    try.error = function(x)
    {
      y = NA
      try_error = tryCatch(tolower(x), error=function(e) e)

      # if not an error, keep the lowercased text
      if (!inherits(try_error, "error"))
        y = tolower(x)

      return(y)
    }

    sentence = sapply(sentence, try.error)
    sentence = sentence[!is.na(sentence)]
    names(sentence) = NULL

    word.list = str_split(sentence, '\\s+')
    words = unlist(word.list)

    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)

    # match() returns the position of the matched term or NA; we only want a
    # boolean, so convert with !is.na() and let sum() count the TRUEs
    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)

    score = sum(pos.matches) - sum(neg.matches)

    return(score)
  }, pos.words, neg.words, .progress=.progress)

  # return a data frame of scores alongside the original text
  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}
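
# A minimal usage sketch (illustrative, not part of this commit). pos.small and
# neg.small are tiny stand-ins for a real opinion lexicon such as Hu & Liu's:
#
#   pos.small <- c("good", "great", "love", "excellent")
#   neg.small <- c("bad", "awful", "hate", "terrible")
#   tweets    <- c("I love this phone, great battery",
#                  "awful screen, I hate it")
#   classify.aggregate(tweets, pos.small, neg.small)
#   # -> data frame with score = c(2, -2) alongside the original text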

# Uses ViralHeat's API to calculate the sentiment of a given piece of text.
# Note that the maximum number of characters accepted is 360.
classify.viralheat = function (text, api.key) {
  library(RCurl)
  library(RJSONIO)
  library(stringr)

  # percent-encode the text, strip any remaining numeric percent-escapes,
  # then re-encode spaces as %20
  text <- URLencode(text)
  text <- str_replace_all(text, "%20", " ")
  text <- str_replace_all(text, "%\\d\\d", "")
  text <- str_replace_all(text, " ", "%20")

  if (str_length(text) > 360) {
    text <- substr(text, 1, 360)
  }

  data <- getURL(paste("http://www.viralheat.com/api/sentiment/review.json?text=",
                       text, "&api_key=", api.key, sep=""))

  js <- fromJSON(data, asText=TRUE)

  # the API returns a probability plus a mood; fold them into one signed score
  sent <- js$prob

  if (js$mood == "negative") {
    sent <- sent * -1
  } else if (js$mood != "positive") {
    # must be neutral
    sent <- 0
  }

  return(sent)
}
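
# Illustrative call (assumes a valid ViralHeat API key; "your-key-here" is a
# placeholder). The result lies in [-1, 1]: the API's probability, signed by
# its reported mood, or 0 for neutral:
#
#   classify.viralheat("This is the best camera I have ever owned",
#                      api.key = "your-key-here")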

classify.naivebayes <- function(textColumns, pstrong=0.5,
                                pweak=1.0, prior=1.0, ...) {

  # build a document-term matrix and load the bundled subjectivity lexicon
  # (columns: word, subjectivity strength, polarity)
  matrix <- classify.dtm(textColumns, ...)
  fpath <- system.file("extdata", "subjectivity.csv", package="sentR")
  lexicon <- read.csv(fpath, header = FALSE)

  counts <- list(positive = length(which(lexicon[,3]=="positive")),
                 negative = length(which(lexicon[,3]=="negative")),
                 total = nrow(lexicon))

  documents <- c()

  for (i in 1:nrow(matrix)) {
    scores <- list(positive=0, negative=0)
    doc <- matrix[i,]
    words <- findFreqTerms(doc, lowfreq=1)

    # accumulate a log-scaled weight for every lexicon word found in the
    # document; strongly subjective words use pstrong, weak ones pweak
    for (word in words) {
      index <- pmatch(word, lexicon[,1], nomatch=0)
      if (index > 0) {
        entry <- lexicon[index,]

        polarity <- as.character(entry[[2]])
        category <- as.character(entry[[3]])
        count <- counts[[category]]

        score <- pweak
        if (polarity == "strongsubj") score <- pstrong
        score <- abs(log(score*prior/count))
        scores[[category]] <- scores[[category]] + score
      }
    }

    # add each class's prior, derived from its share of the lexicon
    for (key in names(scores)) {
      count <- counts[[key]]
      total <- counts[["total"]]
      score <- abs(log(count/total))
      scores[[key]] <- scores[[key]] + score
    }

    best_fit <- names(scores)[which.max(unlist(scores))]

    # if the positive and negative scores are within 10% of each other,
    # call the document neutral
    ratio <- abs(scores$positive/scores$negative)
    if (ratio > 0.90 && ratio < 1.10)
      best_fit <- "neutral"

    documents <- rbind(documents,
                       c(scores$positive, scores$negative,
                         abs(scores$positive/scores$negative), best_fit))
  }

  colnames(documents) <- c("POS","NEG","POS/NEG","SENT")
  return(documents)
}
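
# A quick usage sketch (illustrative; assumes the bundled subjectivity.csv and
# the tm package are available):
#
#   classify.naivebayes(c("I love this new design",
#                         "This update is terrible"))
#   # -> character matrix, one row per document, with columns
#   #    POS, NEG, POS/NEG and SENT ("positive", "negative" or "neutral")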

classify.dtm <- function(textColumns, language="english",
                         minDocFreq = 1, minWordLength = 4,
                         removeNumbers = TRUE, removePunctuation = TRUE,
                         removeStopwords = TRUE,
                         stemWords = FALSE, stripWhitespace = TRUE,
                         toLower = TRUE, weighting = weightTf) {

  require('tm')

  # assemble the preprocessing options understood by DocumentTermMatrix()
  control <- list(language = language, tolower = toLower,
                  removeNumbers = removeNumbers, removePunctuation = removePunctuation,
                  stripWhitespace = stripWhitespace, minWordLength = minWordLength,
                  stopwords = removeStopwords, minDocFreq = minDocFreq,
                  weighting = weighting)

  if (stemWords == TRUE)
    control <- append(control, list(stemming = process.stemwords), after=6)

  # collapse multi-column input to one string per document and coerce to UTF-8
  train <- apply(as.matrix(textColumns), 1, paste, collapse=" ")
  train <- sapply(as.vector(train, mode="character"),
                  iconv, to="UTF8", sub="byte")

  corpus <- Corpus(VectorSource(train), readerControl=list(language=language))
  matrix <- DocumentTermMatrix(corpus, control=control)
  gc() # reclaim memory; intermediate corpus objects can be large

  return(matrix)
}
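
# classify.dtm() also works standalone on any character vector, e.g.:
#
#   dtm <- classify.dtm(c("The quick brown fox", "jumped over the lazy dog"))
#   inspect(dtm)  # tm's viewer for the resulting DocumentTermMatrix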

R/score.R

Lines changed: 0 additions & 118 deletions
This file was deleted.
