Skip to content

Commit a39043b

Browse files
test(bm25-vectorizer): add test cases
1 parent 313f2af commit a39043b

File tree

1 file changed

+306
-0
lines changed

1 file changed

+306
-0
lines changed

test/bm25-vectorizer-specs.js

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
// wink-nlp
2+
//
3+
// Copyright (C) GRAYPE Systems Private Limited
4+
//
5+
// This file is part of “wink-nlp”.
6+
//
7+
// Permission is hereby granted, free of charge, to any
8+
// person obtaining a copy of this software and
9+
// associated documentation files (the "Software"), to
10+
// deal in the Software without restriction, including
11+
// without limitation the rights to use, copy, modify,
12+
// merge, publish, distribute, sublicense, and/or sell
13+
// copies of the Software, and to permit persons to
14+
// whom the Software is furnished to do so, subject to
15+
// the following conditions:
16+
//
17+
// The above copyright notice and this permission notice
18+
// shall be included in all copies or substantial
19+
// portions of the Software.
20+
//
21+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
22+
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
23+
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
24+
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
25+
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
26+
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
27+
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
28+
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
29+
// DEALINGS IN THE SOFTWARE.
30+
31+
//
32+
33+
var chai = require( 'chai' );
34+
var mocha = require( 'mocha' );
35+
var bm25 = require( '../utilities/bm25-vectorizer.js' );
36+
var its = require( '../src/its.js' );
37+
38+
39+
var expect = chai.expect;
40+
var describe = mocha.describe;
41+
var it = mocha.it;
42+
43+
describe( 'bm25-vectorizer', function () {
44+
describe( 'configurations', function () {
  // Values the vectorizer is expected to report whenever a config entry is
  // missing or rejected.
  const defaults = { k: 1, k1: 1.2, b: 0.75, norm: 'none' };

  it( 'empty config should return default values', function () {
    expect( bm25().config() ).to.deep.equal( defaults );
  } );

  it( 'null config should return default values', function () {
    // The config object must go to the factory: `.config()` is used as a
    // plain getter everywhere else in this suite, so passing the nulls to it
    // would merely repeat the empty-config case above.
    expect( bm25( { k: null, k1: null, b: null, norm: null } ).config() ).to.deep.equal( defaults );
  } );

  it( 'NaN/non-string config should return default values', function () {
    expect( bm25( { k: [], k1: {}, b: 'null', norm: 'null' } ).config() ).to.deep.equal( defaults );
  } );

  it( '-ve values in config should return default values', function () {
    expect( bm25( { k: -1, k1: -1, b: -1, norm: 'null' } ).config() ).to.deep.equal( defaults );
  } );

  it( 'large +ve values in config should return default values', function () {
    expect( bm25( { k: 101, k1: 101, b: 2, norm: 'null' } ).config() ).to.deep.equal( defaults );
  } );

  it( 'valid values in config should return defined values', function () {
    expect( bm25( { k: 2, k1: 1.5, b: 0.6, norm: 'l2' } ).config() ).to.deep.equal( { k: 2, k1: 1.5, b: 0.60, norm: 'l2' } );
  } );
} );
69+
70+
describe( 'learn from 0-documents', function () {
  // A vectorizer that has not learned anything: every output below is the
  // "empty" form of the corresponding API.
  const v = bm25();

  it( '.out() should return []', function () {
    expect( v.out() ).to.deep.equal( [] );
  } );

  it( '.out( its.docTermMatrix ) should return []', function () {
    expect( v.out( its.docTermMatrix ) ).to.deep.equal( [] );
  } );

  it( '.out( its.docBOWArray ) should return []', function () {
    expect( v.out( its.docBOWArray ) ).to.deep.equal( [] );
  } );

  it( '.out( its.terms ) should return []', function () {
    expect( v.out( its.terms ) ).to.deep.equal( [] );
  } );

  it( '.out( its.idf ) should return []', function () {
    expect( v.out( its.idf ) ).to.deep.equal( [] );
  } );

  it( '.out( its.modelJSON ) should return JSON of an empty model', function () {
    // The model is serialized as a string, hence the comparison with
    // JSON.stringify() output rather than with an array.
    expect( v.out( its.modelJSON ) ).to.deep.equal( JSON.stringify( { tf: [], idf: {} } ) );
  } );

  it( '.length() should return 0', function () {
    expect( v.length() ).to.equal( 0 );
  } );

  // Document-level API on an index for which no document was learned.
  it( 'doc.out() should return undefined', function () {
    expect( v.doc( 0 ).out() ).to.deep.equal( undefined );
  } );

  it( 'doc.out( its.tf ) should return []', function () {
    expect( v.doc( 0 ).out( its.tf ) ).to.deep.equal( [] );
  } );

  it( 'doc.out( its.vector ) should return []', function () {
    expect( v.doc( 0 ).out( its.vector ) ).to.deep.equal( [] );
  } );

  it( 'doc.out( its.bow ) should return undefined', function () {
    expect( v.doc( 0 ).out( its.bow ) ).to.deep.equal( undefined );
  } );

  it( 'doc.length() should return 0', function () {
    expect( v.doc( 0 ).length() ).to.equal( 0 );
  } );
} );
124+
125+
describe( 'learn from 1-document', function () {
  // Expected bag-of-words (term -> weight) of the single learned document.
  const bow = { rain: 0.395562849, go: 0.287682072, away: 0.287682072 };
  // Expected serialized model for the same document.
  const json = '{"tf":[{"rain":0.395562849,"go":0.287682072,"away":0.287682072}],"idf":{"rain":0.287682072,"go":0.287682072,"away":0.287682072}}';
  const v = bm25();
  v.learn( 'rain rain go away'.split( /\s+/g ) );

  it( 'out() should return [ bow ]', function () {
    expect( v.out() ).to.deep.equal( [ bow ] );
  } );

  it( 'out( its.bow ) should return [ bow ]', function () {
    expect( v.out( its.bow ) ).to.deep.equal( [ bow ] );
  } );

  it( 'out( its.docTermMatrix ) should return document term matrix', function () {
    // Columns follow the alphabetically sorted terms: away, go, rain.
    expect( v.out( its.docTermMatrix ) ).to.deep.equal( [ [ 0.287682072, 0.287682072, 0.395562849 ] ] );
  } );

  it( 'out( its.docBOWArray ) should return [ bow ]', function () {
    expect( v.out( its.docBOWArray ) ).to.deep.equal( [ bow ] );
  } );

  it( 'out( its.idf ) should return idf table of terms', function () {
    expect( v.out( its.idf ) ).to.deep.equal( [ [ 'away', 0.287682072 ], [ 'go', 0.287682072 ], [ 'rain', 0.287682072 ] ] );
  } );

  it( 'out( its.terms ) should return sorted array of terms', function () {
    expect( v.out( its.terms ) ).to.deep.equal( [ 'away', 'go', 'rain' ] );
  } );

  it( 'out( its.modelJSON ) should return models JSON', function () {
    expect( v.out( its.modelJSON ) ).to.deep.equal( json );
  } );

  it( 'vectorOf() should return vector of tokens', function () {
    // 'is' and 'going' were never learned; only 'away' and 'rain' are
    // expected to contribute non-zero entries.
    expect( v.vectorOf( [ 'rain', 'is', 'going', 'away' ] ) ).to.deep.equal( [ 0.287682072, 0, 0.287682072 ] );
  } );

  it( 'length() should return 3', function () {
    expect( v.length() ).to.equal( 3 );
  } );

  it( 'doc.out( its.tf ) should return freq table of terms', function () {
    expect( v.doc( 0 ).out( its.tf ) ).to.deep.equal( [ [ 'rain', 0.395562849 ], [ 'away', 0.287682072 ], [ 'go', 0.287682072 ] ] );
  } );

  it( 'doc.out( its.vector ) should return its vector', function () {
    expect( v.doc( 0 ).out( its.vector ) ).to.deep.equal( [ 0.287682072, 0.287682072, 0.395562849 ] );
  } );

  it( 'doc.out( its.bow ) should return its bow', function () {
    expect( v.doc( 0 ).out( its.bow ) ).to.deep.equal( bow );
  } );

  it( 'doc.length() should return 3', function () {
    expect( v.doc( 0 ).length() ).to.equal( 3 );
  } );
} );
183+
184+
describe( 'learn from multiple documents with l2 norm', function () {
  // Training corpus; each sentence is split on whitespace before learning.
  const corpus = [
    'rats are blue',
    'cats are white',
    'some rats rats are black'
  ];
  // Vocabulary expected from the corpus, in alphabetical order.
  const expectedTerms = [ 'are', 'black', 'blue', 'cats', 'rats', 'some', 'white' ];
  // Expected [ term, idf ] pairs, in the order the assertion requires.
  const expectedIdf = [
    [ 'black', 0.980829253 ],
    [ 'blue', 0.980829253 ],
    [ 'cats', 0.980829253 ],
    [ 'some', 0.980829253 ],
    [ 'white', 0.980829253 ],
    [ 'rats', 0.470003629 ],
    [ 'are', 0.133531393 ]
  ];
  // Expected document-term matrix: one row per learned sentence, columns
  // aligned with `expectedTerms`.
  const expectedDtm = [
    [ 0.121858341, 0, 0.895087087, 0, 0.428916835, 0, 0 ],
    [ 0.095823468, 0, 0, 0.703852919, 0, 0, 0.703852919 ],
    [ 0.086275085, 0.633717097, 0, 0, 0.435157318, 0.633717097, 0 ]
  ];

  const vectorizer = bm25( { norm: 'l2' } );
  corpus.forEach( function ( sentence ) {
    vectorizer.learn( sentence.split( /\s+/g ) );
  } );

  it( 'doc.out( its.vector ) should return its vector', function () {
    // The third document's vector is the third row of the matrix.
    const thirdDocVector = vectorizer.doc( 2 ).out( its.vector );
    expect( thirdDocVector ).to.deep.equal( expectedDtm[ 2 ] );
  } );

  it( 'out( its.idf ) should return its idfs freq table', function () {
    const idfTable = vectorizer.out( its.idf );
    expect( idfTable ).to.deep.equal( expectedIdf );
  } );

  it( 'out( its.terms ) should return its doc terms in alpha sort', function () {
    const learnedTerms = vectorizer.out( its.terms );
    expect( learnedTerms ).to.deep.equal( expectedTerms );
  } );

  it( 'out( its.docTermMatrix ) should return its doc term matrix', function () {
    const matrix = vectorizer.out( its.docTermMatrix );
    expect( matrix ).to.deep.equal( expectedDtm );
  } );

  it( 'vectorOf() should return its vector', function () {
    // 'were' is not part of the learned vocabulary.
    const queryTokens = 'rats were blue'.split( /\s+/g );
    expect( vectorizer.vectorOf( queryTokens ) ).to.deep.equal( [ 0, 0, 0.901807807, 0, 0.432137338, 0, 0 ] );
  } );
} );
225+
226+
describe( 'learn from multiple documents with l1 norm', function () {
  // Vocabulary expected from the three sentences below, in alphabetical order.
  const terms = [ 'are', 'black', 'blue', 'cats', 'rats', 'some', 'white' ];
  // Expected [ term, idf ] pairs, in the order the assertion requires.
  const idf = [
    [ 'black', 0.980829253 ],
    [ 'blue', 0.980829253 ],
    [ 'cats', 0.980829253 ],
    [ 'some', 0.980829253 ],
    [ 'white', 0.980829253 ],
    [ 'rats', 0.470003629 ],
    [ 'are', 0.133531393 ]
  ];
  // Expected document-term matrix: one row per learned sentence, columns
  // aligned with `terms`.
  const dtm = [
    [ 0.08428074, 0, 0.619068019, 0, 0.296651241, 0, 0 ],
    [ 0.063732358, 0, 0, 0.468133821, 0, 0, 0.468133821 ],
    [ 0.048228909, 0.354256208, 0, 0, 0.243258675, 0.354256208, 0 ]
  ];
  const v = bm25( { norm: 'l1' } );
  v.learn( 'rats are blue'.split( /\s+/g ) );
  v.learn( 'cats are white'.split( /\s+/g ) );
  v.learn( 'some rats rats are black'.split( /\s+/g ) );

  it( 'doc.out( its.vector ) should return its vector', function () {
    expect( v.doc( 2 ).out( its.vector ) ).to.deep.equal( [ 0.048228909, 0.354256208, 0, 0, 0.243258675, 0.354256208, 0 ] );
  } );

  it( 'out( its.idf ) should return its idfs freq table', function () {
    expect( v.out( its.idf ) ).to.deep.equal( idf );
  } );

  it( 'out( its.terms ) should return its doc terms in alpha sort', function () {
    expect( v.out( its.terms ) ).to.deep.equal( terms );
  } );

  it( 'out( its.docTermMatrix ) should return its doc term matrix', function () {
    expect( v.out( its.docTermMatrix ) ).to.deep.equal( dtm );
  } );

  it( 'should throw error learn() is called after out()', function () {
    // Call learn() through a wrapper so the tokens arrive as its argument.
    // `v.learn.bind( tokens )` would bind the array as `this` and invoke
    // learn() with no tokens at all.
    expect( function () {
      v.learn( [ 'hello', 'world' ] );
    } ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
  } );
} );
267+
268+
describe( 'values of TF & IDF', function () {
  // Four name variants; each is lower-cased and split on whitespace before
  // being learned.
  const corpus = [
    'Bach',
    'J Bach',
    'Johann S Bach',
    'Johann Sebastian Bach'
  ];
  // Hand-computed idf values contained in the expected model:
  // johann: `ln( 1 + ( ( 4 - 2 + 0.5 ) / ( 2 + 0.5 ) ) ) = 0.693147181`
  // bach: `ln( 1 + ( ( 4 - 4 + 0.5 ) / ( 4 + 0.5 ) ) ) = 0.105360516`
  const expectedModel = '{"tf":[{"bach":1},{"j":0.919531173,"bach":0.080468827},{"johann":0.346144285,"s":0.601240713,"bach":0.052615002},{"johann":0.346144285,"sebastian":0.601240713,"bach":0.052615002}],"idf":{"bach":0.105360516,"j":1.203972804,"johann":0.693147181,"s":1.203972804,"sebastian":1.203972804}}';

  const vectorizer = bm25( { norm: 'l1' } );
  corpus.forEach( function ( name ) {
    vectorizer.learn( name.toLowerCase().split( /\s+/g ) );
  } );

  it( 'should return correct idf values', function () {
    // The serialized model is compared as a string.
    const serialized = vectorizer.out( its.modelJSON );
    expect( serialized ).to.equal( expectedModel );
  } );
} );
282+
283+
describe( 'completely OOV tokens with l1 norm', function () {
  // Learn four name variants (lower-cased, whitespace-tokenized), then query
  // with tokens that never occurred in any of them.
  const corpus = [ 'Bach', 'J Bach', 'Johann S Bach', 'Johann Sebastian Bach' ];
  const vectorizer = bm25( { norm: 'l1' } );
  corpus.forEach( function ( name ) {
    vectorizer.learn( name.toLowerCase().split( /\s+/g ) );
  } );

  it( 'should return 0-vector', function () {
    const unseenTokens = [ 'cat', 'cat', 'green', 'is' ];
    // One zero is expected for each of the five learned terms.
    expect( vectorizer.vectorOf( unseenTokens ) ).to.deep.equal( [ 0, 0, 0, 0, 0 ] );
  } );
} );
294+
295+
describe( 'completely OOV tokens with l2 norm', function () {
  // Learn four name variants (lower-cased, whitespace-tokenized), then query
  // with tokens that never occurred in any of them.
  const corpus = [ 'Bach', 'J Bach', 'Johann S Bach', 'Johann Sebastian Bach' ];
  const vectorizer = bm25( { norm: 'l2' } );
  corpus.forEach( function ( name ) {
    vectorizer.learn( name.toLowerCase().split( /\s+/g ) );
  } );

  it( 'should return 0-vector', function () {
    const unseenTokens = [ 'cat', 'cat', 'green', 'is' ];
    // One zero is expected for each of the five learned terms.
    expect( vectorizer.vectorOf( unseenTokens ) ).to.deep.equal( [ 0, 0, 0, 0, 0 ] );
  } );
} );
306+
} );

0 commit comments

Comments
 (0)