Skip to content

Commit 0b7b72b

Browse files
feat: add bowOf() method in bm25 vectorizer
1 parent 68fb316 commit 0b7b72b

File tree

3 files changed

+57
-2
lines changed

3 files changed

+57
-2
lines changed

test/bm25-vectorizer-specs.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ describe( 'bm25-vectorizer', function () {
120120
expect( v.vectorOf( [ 'rain', 'is', 'going', 'away' ] ) ).to.deep.equal( [ 0.287682, 0, 0.287682 ] );
121121
} );
122122

123+
it( 'bowOf() should return bow of tokens', function () {
124+
expect( v.bowOf( [ 'rain', 'is', 'going', 'away' ] ) ).to.deep.equal( { away: 0.287682, rain: 0.287682 } );
125+
} );
126+
123127
it( 'doc.out( its.tf ) should return freq table of terms', function () {
124128
expect( v.doc( 0 ).out( its.tf ) ).to.deep.equal( [ [ 'rain', 0.395563 ], [ 'away', 0.287682 ], [ 'go', 0.287682 ] ] );
125129
} );
@@ -183,6 +187,10 @@ describe( 'bm25-vectorizer', function () {
183187
it( 'vectorOf() should return its vector', function () {
184188
expect( v.vectorOf( 'rats were blue'.split( /\s+/g ) ) ).to.deep.equal( [ 0, 0, 0.901808, 0, 0.432138, 0, 0 ] );
185189
} );
190+
191+
it( 'bowOf() should return its bow', function () {
192+
expect( v.bowOf( 'rats were blue'.split( /\s+/g ) ) ).to.deep.equal( { blue: 0.901808, rats: 0.432138 } );
193+
} );
186194
} );
187195

188196
describe( 'learn from multiple documents with l1 norm', function () {
@@ -252,6 +260,10 @@ describe( 'bm25-vectorizer', function () {
252260
it( 'should return 0-vector', function () {
253261
expect( v.vectorOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( [ 0, 0, 0, 0, 0 ] );
254262
} );
263+
264+
it( 'should return empty bow', function () {
265+
expect( v.bowOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( {} );
266+
} );
255267
} );
256268

257269
describe( 'completely OOV tokens with l2 norm', function () {
@@ -264,6 +276,10 @@ describe( 'bm25-vectorizer', function () {
264276
it( 'should return 0-vector', function () {
265277
expect( v.vectorOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( [ 0, 0, 0, 0, 0 ] );
266278
} );
279+
280+
it( 'should return empty bow', function () {
281+
expect( v.bowOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( {} );
282+
} );
267283
} );
268284

269285
describe( 'load model json', function () {

types/index.d.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ declare module 'wink-nlp/utilities/bm25-vectorizer' {
304304
// turn off exporting by default since we don't want to expose internal details
305305
export { };
306306

307-
import { Tokens, Document, ItsFunction } from 'wink-nlp';
307+
import { Tokens, Document, ItsFunction, Bow } from 'wink-nlp';
308308

309309
export type Norm = "l1" | "l2" | "none";
310310

@@ -320,6 +320,7 @@ declare module 'wink-nlp/utilities/bm25-vectorizer' {
320320
out<T>(f: ItsFunction<T>): T;
321321
doc(n: number): Document;
322322
vectorOf(tokens: Tokens): number[];
323+
bowOf(tokens: Tokens): Bow;
323324
config(): BM25VectorizerConfig;
324325
loadModel(json: string): void;
325326
}

utilities/bm25-vectorizer.js

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ var bm25Vectorizer = function ( config ) {
276276
/**
277277
* Computes the vector of the input document, given in the form of tokens, using
278278
* the tf-idf learned so far.
279-
* @param {string} tokens tokenized document, usually obtained via winkNLP.
279+
* @param {string[]} tokens tokenized document, usually obtained via winkNLP.
280280
* @return {number[]} its vector.
281281
*/
282282
methods.vectorOf = function ( tokens ) {
@@ -302,9 +302,47 @@ var bm25Vectorizer = function ( config ) {
302302
} else if ( norm === NONE ) thisNorm = 1;
303303

304304
// `thisNorm || 1` ensures that there is no attempt to divide by zero!
305+
// This may happen if all tokens are unseen.
305306
return arr.map( ( v ) => +( v / ( thisNorm || 1 ) ).toFixed( precision ) );
306307
}; // vectorOf()
307308

309+
// ## bowOf
/**
 * Builds the bag-of-words of a tokenized document: every input token that has
 * a non-zero learned idf is mapped to its BM25 weight, which is then divided
 * by the norm selected via `norm` and rounded to `precision` decimal places.
 * Tokens without a learned idf (i.e. unseen ones) are left out of the result.
 *
 * @param {string[]} tokens tokenized text, usually obtained via winkNLP.
 * @return {object} prototype-less map of term to weight; empty when none of
 * the tokens has been seen before.
 */
methods.bowOf = function ( tokens ) {
  // NOTE(review): `computeWeights()` is defined outside this block; presumably
  // it brings `idf` up to date before it is read below — confirm.
  computeWeights();

  const weights = Object.create( null );
  const tokenCount = tokens.length;
  // Document-length part of the BM25 denominator. It depends only on the
  // input's length relative to the average learned document length
  // (`sumOfAllDLs / docId`), so it is computed once for all the terms.
  const lengthPenalty = k1 * ( 1 - b + ( b * ( tokenCount / ( sumOfAllDLs / docId ) ) ) );

  // Pass 1: count occurrences of known terms only; unseen tokens are skipped.
  for ( let at = 0; at < tokenCount; at += 1 ) {
    const term = tokens[ at ];
    if ( idf[ term ] ) weights[ term ] = ( weights[ term ] || 0 ) + 1;
  }

  // Pass 2: turn each count into its BM25 weight and accumulate the norm.
  const terms = Object.keys( weights );
  let total = 0;
  terms.forEach( ( term ) => {
    const count = weights[ term ];
    const score = idf[ term ] * ( ( k1 + 1 ) * count ) / ( lengthPenalty + count );
    weights[ term ] = score;
    total += normFn[ norm ]( score );
  } );

  // Resolve the divisor: square root of the accumulated value for L2, a
  // constant 1 when normalization is off, the accumulated value otherwise.
  let divisor = total;
  if ( norm === L2 ) {
    divisor = Math.sqrt( total );
  } else if ( norm === NONE ) {
    divisor = 1;
  }

  // Pass 3: normalize and round. No `divisor || 1` guard is required, unlike
  // in `vectorOf`: a zero divisor implies that there are no terms to visit.
  terms.forEach( ( term ) => {
    weights[ term ] = +( weights[ term ] / divisor ).toFixed( precision );
  } );

  return weights;
}; // bowOf()
345+
308346
methods.config = ( () => ( { k: k, k1: k1, b: b, norm: norm } ) );
309347

310348
// ## loadModel

0 commit comments

Comments
 (0)