
Commit fb2580d

feat(*): add method to extract contextual word vectors from doc
1 parent 5fd4da0 commit fb2580d

File tree

src/doc-v2.js
src/wink-nlp.js

2 files changed: +107 -30 lines changed

src/doc-v2.js

Lines changed: 107 additions & 0 deletions
@@ -70,6 +70,8 @@ var itmDocumentOut = require( './api/itm-document-out.js' );
 // Print tokens, it is primarily for command line output.
 var printTokens = require( './api/print-tokens.js' );
 
+var its = require( './its.js' );
+
 // <hr/>
 
 // # Doc
@@ -129,6 +131,9 @@ var doc = function ( docData, addons ) {
   var itemCustomEntity;
   var itemSentence;
 
+  // Vectors API
+  var contextualVectors;
+
   // Others.
   var isLexeme = cache.lookup;
 
@@ -444,6 +449,106 @@ var doc = function ( docData, addons ) {
 
   // <hr/>
 
+  // ### contextualVectors
+  /**
+   *
+   * Makes a JSON of contextually relevant words in the winkNLP format.
+   *
+   * @return {string} containing the JSON.
+   */
+  // eslint-disable-next-line complexity
+  contextualVectors = function ( { lemma = true, specificWordVectors = [], similarWordVectors = false, wordVectorsLimit = 0 } = {} ) {
+    // Initialize contextual vectors.
+    const cv = Object.create( null );
+    // The following properties are constants and can therefore be copied directly.
+    cv.precision = docData.wordVectors.precision;
+    cv.l2NormIndex = docData.wordVectors.l2NormIndex;
+    cv.wordIndex = docData.wordVectors.wordIndex;
+    cv.dimensions = docData.wordVectors.dimensions;
+    cv.unkVector = docData.wordVectors.unkVector;
+    // The following properties will be determined on the basis of the context.
+    cv.size = 0;
+    cv.words = [];
+    cv.vectors = Object.create( null );
+    // Shortcut to all the word vectors.
+    const awvs = docData.wordVectors.vectors;
+
+    // Extract all of the document's tokens.
+    const docTokens = colTokens( 0, docData.numOfTokens - 1 )()
+      .out()
+      .map( ( t ) => t.toLowerCase() );
+    let docTokensLemma = [];
+    if ( lemma === true ) docTokensLemma = colTokens( 0, docData.numOfTokens - 1 )()
+      .out( its.lemma )
+      .map( ( t ) => t.toLowerCase() );
+
+    for ( let i = 0; i < docTokens.length; i += 1 ) cv.vectors[ docTokens[ i ] ] = awvs[ docTokens[ i ] ] || cv.unkVector;
+    for ( let i = 0; i < docTokensLemma.length; i += 1 ) cv.vectors[ docTokensLemma[ i ] ] = awvs[ docTokensLemma[ i ] ] || cv.unkVector;
+    for ( let i = 0; i < specificWordVectors.length; i += 1 ) cv.vectors[ specificWordVectors[ i ] ] = awvs[ specificWordVectors[ i ] ] || cv.unkVector;
+
+    if ( similarWordVectors ) {
+      // Extract similar words on the basis of the shortest Manhattan distance.
+      const allUniqueTokens = Object.keys( cv.vectors );
+      // Set up the similar words array, sized to the number of unique tokens.
+      const similarWords = new Array( allUniqueTokens.length );
+      // Placeholder for maintaining the similarity score based on Manhattan distance.
+      const similarWordsScore = new Array( allUniqueTokens.length );
+      // Initialize to a large distance!
+      similarWordsScore.fill( 1000000 );
+
+      // Initialize the contextual vectors' size, i.e. the vocab.
+      cv.size = allUniqueTokens.length;
+
+      // Now search for each one of them in the entire word vector space.
+      // Keep updating the smallest distance.
+      for ( let i = 0; i < allUniqueTokens.length; i += 1 ) {
+        const cwv = cv.vectors[ allUniqueTokens[ i ] ];
+
+        for ( const word in awvs ) { // eslint-disable-line guard-for-in
+          if ( word === allUniqueTokens[ i ] ) continue; // eslint-disable-line no-continue
+          const wv = awvs[ word ];
+          let distance = 0;
+
+          for ( let k = 0; k < cv.dimensions && distance < similarWordsScore[ i ]; k += 1 ) {
+            distance += Math.abs( cwv[ k ] - wv[ k ] );
+          } // Manhattan distance computation loop.
+
+          if ( distance < similarWordsScore[ i ] ) {
+            similarWordsScore[ i ] = distance;
+            similarWords[ i ] = word;
+          }
+        } // Traversing all the word vectors.
+      } // Traversing all the tokens in the corpus.
+
+      // Update contextual vectors using the list of similar words; also update their size.
+      for ( let i = 0; i < similarWords.length; i += 1 ) {
+        if ( cv.vectors[ similarWords[ i ] ] === undefined ) {
+          cv.vectors[ similarWords[ i ] ] = awvs[ similarWords[ i ] ] || cv.unkVector;
+          cv.size += 1;
+        }
+      }
+
+    }
+
+    // Fill the balance space, if any, on the basis of wordVectorsLimit.
+    for ( let i = 0; cv.size < wordVectorsLimit; i += 1 ) {
+      const word = docData.wordVectors.words[ i ];
+      if ( !cv.vectors[ word ] ) {
+        cv.vectors[ word ] = awvs[ word ];
+        cv.size += 1;
+      }
+    }
+
+    // Sort words on the basis of their usage frequency.
+    cv.words = Object.keys( cv.vectors )
+      .map( ( w ) => ( { w: w, i: ( cv.vectors[ w ][ cv.wordIndex ] < 0 ) ? Infinity : cv.vectors[ w ][ cv.wordIndex ] } ) )
+      .sort( ( a, b ) => a.i - b.i )
+      .map( ( o ) => o.w );
+
+    // Update the word index entry inside every vector.
+    for ( let i = 0; i < cv.size; i += 1 ) cv.vectors[ cv.words[ i ] ][ cv.wordIndex ] = i;
+    return JSON.stringify( cv, null, 2 );
+  }; // contextualVectors()
 
   // Published chainable methods.
   methods.entities = colEntities;
@@ -459,6 +564,8 @@ var doc = function ( docData, addons ) {
   // Ensure that we make a deep copy of config before returning to avoid corruption!
   methods.pipeConfig = () => JSON.parse( JSON.stringify( docData.currPipe ) );
 
+  methods.contextualVectors = contextualVectors;
+
   return methods;
 };
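For orientation, here is a minimal usage sketch of the new method. It is not part of this commit: the model and vector paths are borrowed from the scratch code removed in src/wink-nlp.js below, the option values are illustrative rather than recommended, and the output file name is hypothetical.

// Sketch only: exercise doc.contextualVectors() end to end.
// Assumes a model plus a word-vectors JSON, loaded the same way as the
// scratch code removed from src/wink-nlp.js in this commit.
const fs = require( 'fs' );
const model = require( '../test/test-model/model.js' );
const vectors = require( '../test/test-model/languages/cur/models/test-vectors.json' );
const winkNLP = require( '../src/wink-nlp.js' );

const nlp = winkNLP( model, undefined, vectors );
const doc = nlp.readDoc( 'The president greets the press in Chicago.' );

// Options mirror the new method's signature and defaults:
// { lemma = true, specificWordVectors = [], similarWordVectors = false, wordVectorsLimit = 0 }
const cvJSON = doc.contextualVectors( {
  lemma: true,
  specificWordVectors: [ 'president', 'press' ],
  similarWordVectors: true,
  wordVectorsLimit: 100
} );

// The method returns a pretty-printed JSON string in the winkNLP word-vectors
// format; here it is simply written out for later use ( file name is hypothetical ).
fs.writeFileSync( 'contextual-vectors.json', cvJSON );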

src/wink-nlp.js

Lines changed: 0 additions & 30 deletions
@@ -454,33 +454,3 @@ var nlp = function ( theModel, pipe, wordVectorsJSON = null ) {
 }; // wink
 
 module.exports = nlp;
-
-/* This is a test code, will eventually move to tests.
-const model = require( '../test/test-model/model.js' );
-const vectors = require( '../test/test-model/languages/cur/models/test-vectors.json' );
-const winkNLP = require( '../src/wink-nlp.js' );
-const myNLP = winkNLP( model, undefined, vectors );
-const its = myNLP.its;
-const as = myNLP.as;
-
-const text = 'Obama'; // are eating bananas and zxcv!
-
-const doc = myNLP.readDoc( text );
-
-console.log( doc.tokens().out(its.lemma, as.vector).join( ', ') );
-
-console.log( doc.tokens().filter( ( t ) => t.out().length > 0 ).out(its.lemma, as.vector) );
-
-console.log( myNLP.vectorOf( 'president' ) );
-
-// .filter( (t) => !t.out( its.stopWordFlag ) )
-const v1 = myNLP.readDoc( 'The president greets the press in Chicago' ).tokens().out( its.lemma, as.vector );
-const v2 = myNLP.readDoc( 'Obama speaks to the media in Illinois' ).tokens().out( its.lemma, as.vector );
-
-let r = 0;
-for ( let k = 0; k < 100; k += 1 ) {
-  r += v1[ k ] * v2[ k ];
-}
-
-console.log( +( r / ( v1[ 100 ] * v2[ 100 ]) ).toFixed( 1 ) );
-// */
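The block removed above was hand-checking averaged sentence vectors. As a reference for when it eventually moves to tests (as its own comment says), here is a hedged sketch of the same check written as a small helper; the 100-dimension layout and the L2 norm stored at index 100 are taken from the removed snippet itself, not from any documented guarantee.

// Sketch only: reproduces the similarity computation from the removed scratch code.
// Assumes tokens().out( its.lemma, as.vector ) returns a 101-element array whose
// last entry ( index 100 ) holds the L2 norm, as the removed snippet implies.
const model = require( '../test/test-model/model.js' );
const vectors = require( '../test/test-model/languages/cur/models/test-vectors.json' );
const winkNLP = require( '../src/wink-nlp.js' );

const nlp = winkNLP( model, undefined, vectors );
const its = nlp.its;
const as = nlp.as;

const similarity = ( v1, v2 ) => {
  let dot = 0;
  // Dot product over the 100 vector dimensions.
  for ( let k = 0; k < 100; k += 1 ) dot += v1[ k ] * v2[ k ];
  // Normalize by the product of the two stored L2 norms.
  return +( dot / ( v1[ 100 ] * v2[ 100 ] ) ).toFixed( 1 );
};

const v1 = nlp.readDoc( 'The president greets the press in Chicago' ).tokens().out( its.lemma, as.vector );
const v2 = nlp.readDoc( 'Obama speaks to the media in Illinois' ).tokens().out( its.lemma, as.vector );
console.log( similarity( v1, v2 ) );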
