feat: add bowOf() method in bm25 vectorizer

sanjayaksaxena · sanjayaksaxena · commit 0b7b72ba33d3 · 2022-01-30T18:12:54.000+05:30
diff --git a/test/bm25-vectorizer-specs.js b/test/bm25-vectorizer-specs.js
@@ -120,6 +120,10 @@ describe( 'bm25-vectorizer', function () {
       expect( v.vectorOf( [ 'rain', 'is', 'going', 'away' ] ) ).to.deep.equal( [ 0.287682, 0, 0.287682 ] );
     } );
 
+    it( 'bowOf() should return bow of tokens', function () {
+      expect( v.bowOf( [ 'rain', 'is', 'going', 'away' ] ) ).to.deep.equal( { away: 0.287682, rain: 0.287682 } );
+    } );
+
     it( 'doc.out( its.tf ) should return freq table of terms', function () {
       expect( v.doc( 0 ).out( its.tf ) ).to.deep.equal( [ [ 'rain', 0.395563 ], [ 'away', 0.287682 ], [ 'go', 0.287682 ] ] );
     } );
@@ -183,6 +187,10 @@ describe( 'bm25-vectorizer', function () {
     it( 'vectorOf() should return its vector', function () {
       expect( v.vectorOf( 'rats were blue'.split( /\s+/g ) ) ).to.deep.equal( [ 0, 0, 0.901808, 0, 0.432138, 0, 0 ] );
     } );
+
+    it( 'bowOf() should return its bow', function () {
+      expect( v.bowOf( 'rats were blue'.split( /\s+/g ) ) ).to.deep.equal( { blue: 0.901808, rats: 0.432138 } );
+    } );
   } );
 
   describe( 'learn from multiple documents with l1 norm', function () {
@@ -252,6 +260,10 @@ describe( 'bm25-vectorizer', function () {
     it( 'should return 0-vector', function () {
       expect( v.vectorOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( [ 0, 0, 0, 0, 0 ] );
     } );
+
+    it( 'should return empty bow', function () {
+      expect( v.bowOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( {} );
+    } );
   } );
 
   describe( 'completely OOV tokens with l2 norm', function () {
@@ -264,6 +276,10 @@ describe( 'bm25-vectorizer', function () {
     it( 'should return 0-vector', function () {
       expect( v.vectorOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( [ 0, 0, 0, 0, 0 ] );
     } );
+
+    it( 'should return empty bow', function () {
+      expect( v.bowOf([ 'cat', 'cat', 'green', 'is' ] ) ).to.deep.equal( {} );
+    } );
   } );
 
   describe( 'load model json', function () {
diff --git a/types/index.d.ts b/types/index.d.ts
@@ -304,7 +304,7 @@ declare module 'wink-nlp/utilities/bm25-vectorizer' {
   // turn off exporting by default since we don't want to expose internal details
   export { };
 
-  import { Tokens, Document, ItsFunction } from 'wink-nlp';
+  import { Tokens, Document, ItsFunction, Bow } from 'wink-nlp';
 
   export type Norm = "l1" | "l2" | "none";
 
@@ -320,6 +320,7 @@ declare module 'wink-nlp/utilities/bm25-vectorizer' {
     out<T>(f: ItsFunction<T>): T;
     doc(n: number): Document;
     vectorOf(tokens: Tokens): number[];
+    bowOf(tokens: Tokens): Bow;
     config(): BM25VectorizerConfig;
     loadModel(json: string): void;
   }
diff --git a/utilities/bm25-vectorizer.js b/utilities/bm25-vectorizer.js
@@ -276,7 +276,7 @@ var bm25Vectorizer = function ( config ) {
   /**
    * Computes the vector of the input document given in form of tokens using
    * the tf-idf learned so far.
-   * @param  {string}     tokens  tokenized document, usually obtained via winkNLP.
+   * @param  {string[]}   tokens  tokenized document, usually obtained via winkNLP.
    * @return {number[]}           its vector.
    */
   methods.vectorOf = function ( tokens ) {
@@ -302,9 +302,47 @@ var bm25Vectorizer = function ( config ) {
     } else if ( norm === NONE ) thisNorm = 1;
 
     // `thisNorm || 1` ensures that there is no attempt to divide by zero!
+    // This may happen if all tokens are unseen.
     return arr.map( ( v ) => +( v / ( thisNorm || 1 ) ).toFixed( precision ) );
   }; // vectorOf()
 
+  // ## bowOf
+  /**
+   * Computes the bag-of-words (bowOf) of the input document, using the tf-idf
+   * learned so far.
+   * @param  {string[]}   tokens  tokenized text, usually obtained via winkNLP.
+   * @return {object}             its bow.
+   */
+  methods.bowOf = function ( tokens ) {
+    computeWeights();
+    const bow = Object.create( null );
+    const avgDL = sumOfAllDLs / docId;
+    let thisNorm = 0;
+
+    for ( let i = 0; i < tokens.length; i += 1 ) {
+      const t = tokens[ i ];
+      // bow applies only if the token is not an unseen one!
+      if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
+    }
+
+    for ( const t in bow ) { // eslint-disable-line guard-for-in
+      bow[ t ] = idf[ t ] * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
+      thisNorm += normFn[ norm ]( bow[ t ] );
+    }
+
+    if ( norm === L2 ) {
+      thisNorm = Math.sqrt( thisNorm );
+    } else if ( norm === NONE ) thisNorm = 1;
+
+    for ( const t in bow ) { // eslint-disable-line guard-for-in
+      // Unlike in `vectorOf`, `thisNorm || 1` is not needed here as bow will be
+      // empty if `thisNorm` is zero!
+      bow[ t ] = +( bow[ t ] / thisNorm ).toFixed( precision );
+    }
+
+    return bow;
+  }; // bowOf()
+
   methods.config = ( () => ( { k: k, k1: k1, b: b, norm: norm } ) );
 
   // ## loadModel