|
# Simple aggregation-based sentiment scorer with regex-driven clean-up.
#
# Each sentence is stripped of punctuation, control characters, digits,
# retweet/"via" handles, @mentions and URLs, lower-cased, tokenized on
# whitespace, and scored as (#positive-lexicon hits - #negative-lexicon hits).
#
# Args:
#   sentences: character vector of input texts.
#   pos.words: character vector of positive-sentiment terms.
#   neg.words: character vector of negative-sentiment terms.
#   .progress: progress-bar name forwarded to plyr::laply ('none', 'text', ...).
#
# Returns: data.frame with columns `score` and `text` (one row per sentence).
classify.aggregate <- function(sentences, pos.words, neg.words, .progress = 'none')
{
  library(plyr)
  library(stringr)

  # array ("a") of scores back = "l" + "a" + "ply" = "laply":
  scores <- laply(sentences, function(sentence, pos.words, neg.words) {

    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    # One digit-removal pass suffices (the original also ran '\\d+', which
    # was redundant with the POSIX class below).
    sentence <- gsub('[[:digit:]]', '', sentence)
    # perl = TRUE is required: the non-capturing group '(?:' is PCRE syntax
    # and is rejected by base R's default TRE engine.
    sentence <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", sentence, perl = TRUE)
    sentence <- gsub("@\\w+", "", sentence)
    sentence <- gsub("http\\w+", "", sentence)
    # Collapse runs of spaces/tabs to ONE space -- deleting them outright
    # (as the original did) glued adjacent words together.
    sentence <- gsub("[ \t]{2,}", " ", sentence)
    sentence <- gsub("^\\s+|\\s+$", "", sentence)

    # tolower() can fail on invalid multi-byte input; map failures to NA
    # so the offending sentence is dropped rather than aborting the run.
    try.error <- function(x) {
      tryCatch(tolower(x), error = function(e) NA)
    }

    sentence <- sapply(sentence, try.error)
    sentence <- sentence[!is.na(sentence)]
    names(sentence) <- NULL

    word.list <- str_split(sentence, '\\s+')
    words <- unlist(word.list)

    # match() returns the lexicon position or NA; we only need booleans,
    # which sum() then counts.
    pos.matches <- !is.na(match(words, pos.words))
    neg.matches <- !is.na(match(words, neg.words))

    score <- sum(pos.matches) - sum(neg.matches)
    return(score)
  }, pos.words, neg.words, .progress = .progress)

  # Returns a data frame pairing each score with its original text.
  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}
| 57 | + |
# Uses ViralHeat's API to calculate the sentiment of a given piece of text.
# The API accepts at most 360 characters; longer input is truncated.
#
# Returns js$prob signed by mood: negative => -prob, positive => +prob,
# anything else => 0 (neutral).
#
# NOTE(review): assumes the JSON response has `prob` and `mood` fields --
# confirm against the (now defunct) ViralHeat API docs.
classify.viralheat <- function(text, api.key) {
  library(RCurl)
  library(RJSONIO)
  library(stringr)  # str_replace_all / str_length below come from stringr

  # Percent-encode, strip any remaining %XX escapes the API rejects
  # (only digit-digit escapes are matched -- hex-letter escapes such as
  # %2F pass through; presumably intentional, verify), then re-encode spaces.
  text <- URLencode(text)
  text <- str_replace_all(text, "%20", " ")
  text <- str_replace_all(text, "%\\d\\d", "")
  text <- str_replace_all(text, " ", "%20")

  # substr() is 1-based: the original substr(text, 0, 359) kept only 359
  # characters against the documented 360-character limit.
  if (str_length(text) > 360) {
    text <- substr(text, 1, 360)
  }

  data <- getURL(paste("http://www.viralheat.com/api/sentiment/review.json?text=",
                       text, "&api_key=", api.key, sep = ""))

  js <- fromJSON(data, asText = TRUE)

  sent <- js$prob

  # FIXME: global debug leftover -- kept so any caller reading `j` still
  # works, but this should be removed once confirmed unused.
  j <<- js

  if (js$mood == "negative") {
    sent <- sent * -1
  } else if (js$mood != "positive") {
    # must be neutral
    sent <- 0
  }

  return(sent)
}
| 93 | + |
# Naive-Bayes-style sentiment classifier driven by the subjectivity lexicon
# bundled with the sentR package (extdata/subjectivity.csv).
#
# Args:
#   textColumns: text input forwarded to classify.dtm().
#   pstrong: probability weight applied to "strongsubj" lexicon entries.
#   pweak:   probability weight applied to all other (weak) entries.
#   prior:   class prior multiplier folded into each word score.
#   ...:     further arguments passed through to classify.dtm().
#
# Returns: character matrix, one row per document, with columns
#   POS, NEG, POS/NEG and SENT ("positive", "negative" or "neutral").
classify.naivebayes <- function(textColumns, pstrong = 0.5,
                                pweak = 1.0, prior = 1.0, ...) {

  # `dtm` instead of `matrix` -- the original shadowed base::matrix.
  dtm <- classify.dtm(textColumns, ...)
  fpath <- system.file("extdata", "subjectivity.csv", package = "sentR")
  # header = FALSE spelled out (original used partial match `head =`).
  lexicon <- read.csv(fpath, header = FALSE)

  counts <- list(positive = sum(lexicon[, 3] == "positive"),
                 negative = sum(lexicon[, 3] == "negative"),
                 total = nrow(lexicon))

  # Preallocate one slot per document instead of rbind-growing (O(n^2)).
  n.docs <- nrow(dtm)
  rows <- vector("list", n.docs)

  for (i in seq_len(n.docs)) {
    scores <- list(positive = 0, negative = 0)
    doc <- dtm[i, ]
    words <- findFreqTerms(doc, lowfreq = 1)

    for (word in words) {
      # pmatch: partial match of the document term against lexicon words
      # (so stemmed terms can still hit full lexicon entries).
      index <- pmatch(word, lexicon[, 1], nomatch = 0)
      if (index > 0) {
        entry <- lexicon[index, ]

        polarity <- as.character(entry[[2]])
        category <- as.character(entry[[3]])
        count <- counts[[category]]

        # Strong-subjectivity words get weight pstrong, others pweak.
        score <- if (polarity == "strongsubj") pstrong else pweak
        score <- abs(log(score * prior / count))
        scores[[category]] <- scores[[category]] + score
      }
    }

    # Add the class-frequency log term for both categories.
    for (key in names(scores)) {
      score <- abs(log(counts[[key]] / counts[["total"]]))
      scores[[key]] <- scores[[key]] + score
    }

    best_fit <- names(scores)[which.max(unlist(scores))]

    ratio <- abs(scores$positive / scores$negative)

    # Near-equal evidence for both classes => call it neutral.
    if (ratio > 0.90 && ratio < 1.10) {
      best_fit <- "neutral"
    }

    rows[[i]] <- c(scores$positive, scores$negative, ratio, best_fit)
  }

  documents <- do.call(rbind, rows)
  colnames(documents) <- c("POS", "NEG", "POS/NEG", "SENT")
  return(documents)
}
| 148 | + |
# Build a tm DocumentTermMatrix from one or more text columns.
#
# Each row of `textColumns` is collapsed into a single document string,
# coerced to UTF-8 (unconvertible bytes escaped), and run through the
# tm preprocessing configured by the arguments below.
#
# Returns: a DocumentTermMatrix, one row per input row.
classify.dtm <- function(textColumns, language = "english",
                         minDocFreq = 1, minWordLength = 4,
                         removeNumbers = TRUE, removePunctuation = TRUE,
                         removeStopwords = TRUE,
                         stemWords = FALSE, stripWhitespace = TRUE,
                         toLower = TRUE, weighting = weightTf) {

  require('tm')

  # Preprocessing options handed to DocumentTermMatrix().
  control <- list(language = language,
                  tolower = toLower,
                  removeNumbers = removeNumbers,
                  removePunctuation = removePunctuation,
                  stripWhitespace = stripWhitespace,
                  minWordLength = minWordLength,
                  stopwords = removeStopwords,
                  minDocFreq = minDocFreq,
                  weighting = weighting)

  if (stemWords == TRUE) {
    # Inserted after the sixth entry to keep the option ordering the
    # downstream tm call expects.
    control <- append(control, list(stemming = process.stemwords), after = 6)
  }

  # Collapse each row into one document string, then normalize encoding.
  docs <- apply(as.matrix(textColumns), 1, paste, collapse = " ")
  docs <- sapply(as.vector(docs, mode = "character"),
                 iconv, to = "UTF8", sub = "byte")

  corpus <- Corpus(VectorSource(docs),
                   readerControl = list(language = language))
  dtm <- DocumentTermMatrix(corpus, control = control)
  gc()  # reclaim the large intermediate corpus promptly

  return(dtm)
}
0 commit comments