Skip to content

Commit 06b844f

Browse files
author
Brian Vaughn
committed
Optimized TfIdfSearchIndex tf-idf calculation by inlining sort functions
1 parent 91e15d8 commit 06b844f

File tree

1 file changed

+31
-38
lines changed

1 file changed

+31
-38
lines changed

source/SearchIndex/TfIdfSearchIndex.js

Lines changed: 31 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -108,55 +108,48 @@ export class TfIdfSearchIndex implements ISearchIndex {
108108
documents.push(uidToDocumentMap[uid]);
109109
}
110110

111-
// Return documents sorted by TF-IDF
112-
return documents.sort((documentA, documentB) =>
113-
this._calculateTfIdf(tokens, documentB, corpus) -
114-
this._calculateTfIdf(tokens, documentA, corpus)
115-
);
116-
}
111+
var tokenMap = this._tokenMap;
112+
var tokenToIdfCache = this._tokenToIdfCache;
113+
var uidFieldName = this._uidFieldName;
117114

118-
/**
119-
* Calculate the inverse document frequency of a search token. This calculation diminishes the weight of tokens that
120-
* occur very frequently in the set of searchable documents and increases the weight of terms that occur rarely.
121-
*/
122-
_calculateIdf(token : string, documents : Array<Object>) : number {
123-
if (!this._tokenToIdfCache[token]) {
124-
var numDocumentsWithToken:number = this._tokenMap[token] && this._tokenMap[token].$numDocumentOccurrences || 0;
115+
function calculateIdf(token : string, documents : Array<Object>) : number {
116+
if (!tokenToIdfCache[token]) {
117+
var numDocumentsWithToken:number = tokenMap[token] && tokenMap[token].$numDocumentOccurrences || 0;
125118

126-
this._tokenToIdfCache[token] = 1 + Math.log(documents.length / (1 + numDocumentsWithToken));
119+
tokenToIdfCache[token] = 1 + Math.log(documents.length / (1 + numDocumentsWithToken));
120+
}
121+
122+
return tokenToIdfCache[token];
127123
}
128124

129-
return this._tokenToIdfCache[token];
130-
}
125+
function calculateTfIdf(tokens : Array<string>, document : Object, documents : Array<Object>) : number {
126+
var score:number = 0;
131127

132-
/**
133-
* Calculate the term frequency–inverse document frequency (TF-IDF) ranking for a set of search tokens and a
134-
* document. The TF-IDF is a numeric statistic intended to reflect how important a word (or words) are to a document
135-
* in a corpus. The TF-IDF value increases proportionally to the number of times a word appears in the document but
136-
* is offset by the frequency of the word in the corpus. This helps to adjust for the fact that some words appear
137-
* more frequently in general (e.g. a, and, the).
138-
*/
139-
_calculateTfIdf(tokens : Array<string>, document : Object, documents : Array<Object>) : number {
140-
var score:number = 0;
128+
for (var i = 0, numTokens = tokens.length; i < numTokens; ++i) {
129+
var token:string = tokens[i];
141130

142-
for (var i = 0, numTokens = tokens.length; i < numTokens; ++i) {
143-
var token:string = tokens[i];
131+
var inverseDocumentFrequency:number = calculateIdf(token, documents);
144132

145-
var inverseDocumentFrequency:number = this._calculateIdf(token, documents);
133+
if (inverseDocumentFrequency === Infinity) {
134+
inverseDocumentFrequency = 0;
135+
}
146136

147-
if (inverseDocumentFrequency === Infinity) {
148-
inverseDocumentFrequency = 0;
149-
}
137+
var uid:any = document && document[uidFieldName];
138+
var termFrequency:number =
139+
tokenMap[token] &&
140+
tokenMap[token].$uidMap[uid] &&
141+
tokenMap[token].$uidMap[uid].$numTokenOccurrences || 0;
150142

151-
var uid:any = document && document[this._uidFieldName];
152-
var termFrequency:number =
153-
this._tokenMap[token] &&
154-
this._tokenMap[token].$uidMap[uid] &&
155-
this._tokenMap[token].$uidMap[uid].$numTokenOccurrences || 0;
143+
score += termFrequency * inverseDocumentFrequency;
144+
}
156145

157-
score += termFrequency * inverseDocumentFrequency;
146+
return score;
158147
}
159148

160-
return score;
149+
// Return documents sorted by TF-IDF
150+
return documents.sort((documentA, documentB) =>
151+
calculateTfIdf(tokens, documentB, corpus) -
152+
calculateTfIdf(tokens, documentA, corpus)
153+
);
161154
}
162155
};

0 commit comments

Comments
 (0)