-
Notifications
You must be signed in to change notification settings - Fork 213
Expand file tree
/
Copy pathstopwords.js
More file actions
113 lines (113 loc) · 3.85 KB
/
stopwords.js
File metadata and controls
113 lines (113 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
// Generated by CoffeeScript 2.0.0-beta7
void function () {
var _, cache, candiateWords, fs, getFilePath, getStopwords, path, removePunctuation, stopwords;
path = require('path');
fs = require('fs');
_ = require('lodash');
cache = {};
/**
 * Load the stopword resource for a language through `require`.
 *
 * Every loader wraps a `require()` call with a literal path so that tools
 * which resolve requires statically can still see each data file.
 *
 * @param {string} lang - Two-letter language code.
 * @returns {*} Whatever `require` yields for the matching stopwords file;
 *   the English file is used for any code that is not listed.
 */
getStopwords = function (lang) {
  var loaders = {
    ar: function () { return require('../data/stopwords/stopwords-ar.txt'); },
    bg: function () { return require('../data/stopwords/stopwords-bg.txt'); },
    cs: function () { return require('../data/stopwords/stopwords-cs.txt'); },
    da: function () { return require('../data/stopwords/stopwords-da.txt'); },
    de: function () { return require('../data/stopwords/stopwords-de.txt'); },
    en: function () { return require('../data/stopwords/stopwords-en.txt'); },
    es: function () { return require('../data/stopwords/stopwords-es.txt'); },
    fi: function () { return require('../data/stopwords/stopwords-fi.txt'); },
    fr: function () { return require('../data/stopwords/stopwords-fr.txt'); },
    hu: function () { return require('../data/stopwords/stopwords-hu.txt'); },
    id: function () { return require('../data/stopwords/stopwords-id.txt'); },
    it: function () { return require('../data/stopwords/stopwords-it.txt'); },
    ko: function () { return require('../data/stopwords/stopwords-ko.txt'); },
    nb: function () { return require('../data/stopwords/stopwords-nb.txt'); },
    nl: function () { return require('../data/stopwords/stopwords-nl.txt'); },
    no: function () { return require('../data/stopwords/stopwords-no.txt'); },
    pl: function () { return require('../data/stopwords/stopwords-pl.txt'); },
    pt: function () { return require('../data/stopwords/stopwords-pt.txt'); },
    ru: function () { return require('../data/stopwords/stopwords-ru.txt'); },
    sv: function () { return require('../data/stopwords/stopwords-sv.txt'); },
    th: function () { return require('../data/stopwords/stopwords-th.txt'); },
    tr: function () { return require('../data/stopwords/stopwords-tr.txt'); },
    zh: function () { return require('../data/stopwords/stopwords-zh.txt'); }
  };
  // Only primitive strings that name an own key select a loader; anything
  // else (unknown codes, non-strings, inherited names) falls back to English.
  var isKnown = typeof lang === 'string' &&
    Object.prototype.hasOwnProperty.call(loaders, lang);
  var load = isKnown ? loaders[lang] : loaders.en;
  return load();
};
/**
 * Build the path of the stopwords data file for a language.
 *
 * @param {string} language - Language code inserted into the file name.
 * @returns {string} `<__dirname>/../data/stopwords/stopwords-<language>.txt`,
 *   normalised by `path.join`.
 */
getFilePath = function (language) {
  var stopwordsDir = path.join(__dirname, '..', 'data', 'stopwords');
  var fileName = 'stopwords-' + language + '.txt';
  return path.join(stopwordsDir, fileName);
};
module.exports = stopwords = function (content, language) {
var count, filePath, hasFs, overlappingStopwords, stopWords, strippedInput, words;
if (null == language)
language = 'en';
hasFs = in$('existsSync', fs);
if (hasFs) {
filePath = getFilePath(language);
if (!fs.existsSync(filePath)) {
console.error("WARNING: No stopwords file found for '" + language + "' - defaulting to English!");
filePath = getFilePath('en');
}
}
if (cache.hasOwnProperty(language)) {
stopWords = cache[language];
} else if (!hasFs) {
stopWords = getStopwords(language);
cache[language] = stopWords;
} else {
stopWords = fs.readFileSync(filePath).toString().split('\n').filter(function (s) {
return s.length > 0;
});
cache[language] = stopWords;
}
strippedInput = removePunctuation(content);
words = candiateWords(strippedInput);
overlappingStopwords = [];
count = 0;
_.each(words, function (w) {
count += 1;
if (stopWords.indexOf(w.toLowerCase()) > -1)
return overlappingStopwords.push(w.toLowerCase());
});
return {
wordCount: count,
stopwordCount: overlappingStopwords.length,
stopWords: overlappingStopwords
};
};
/**
 * Delete every punctuation character matched by the pattern below from
 * `content`. Letters, digits, whitespace and backslashes are left in place.
 *
 * @param {string} content - Text to clean.
 * @returns {string} `content` with all matched characters removed.
 */
removePunctuation = function (content) {
  var punctuationPattern = /[\|\@\<\>\[\]\"\'\.,-\/#\?!$%\^&\*\+;:{}=\-_`~()]/g;
  var cleaned = content.replace(punctuationPattern, '');
  return cleaned;
};
/**
 * Split punctuation-free text into candidate words on single space characters.
 * Consecutive spaces therefore produce empty-string entries, and an empty
 * input produces a one-element array holding the empty string.
 *
 * @param {string} strippedInput - Text to split.
 * @returns {string[]} The pieces between spaces, in order.
 */
candiateWords = function (strippedInput) {
  var separator = ' ';
  var tokens = strippedInput.split(separator);
  return tokens;
};
/**
 * Array-membership test: reports whether `member` is strictly equal to an
 * element stored at one of the indexes `0 .. list.length - 1`.
 *
 * Missing indexes (holes) are skipped, and a `list` with no numeric `length`
 * (for example a plain object) always yields false because no index is visited.
 *
 * @param {*} member - Value to look for.
 * @param {Array} list - Array (or array-like) to search.
 * @returns {boolean} true when a matching element exists, otherwise false.
 */
function in$(member, list) {
  var total = list.length;
  var index = 0;
  while (index < total) {
    if (index in list && list[index] === member) {
      return true;
    }
    index += 1;
  }
  return false;
}
}.call(this);