@@ -70,6 +70,8 @@ var itmDocumentOut = require( './api/itm-document-out.js' );
 // Print tokens; it is primarily for command line output.
 var printTokens = require( './api/print-tokens.js' );

+var its = require( './its.js' );
+
 // <hr/>

 // # Doc
@@ -129,6 +131,9 @@ var doc = function ( docData, addons ) {
   var itemCustomEntity;
   var itemSentence;

+  // Vectors API.
+  var contextualVectors;
+
   // Others.
   var isLexeme = cache.lookup;
@@ -444,6 +449,106 @@ var doc = function ( docData, addons ) {

   // <hr/>

+  // ### contextualVectors
+  /**
+   *
+   * Makes a JSON of contextually relevant words in the winkNLP format.
+   *
+   * @return {string} containing the JSON.
+  */
+  // eslint-disable-next-line complexity
+  contextualVectors = function ( { lemma = true, specificWordVectors = [], similarWordVectors = false, wordVectorsLimit = 0 } = {} ) {
+    // Initialize contextual vectors.
+    const cv = Object.create( null );
+    // The following properties are constants and can therefore be copied directly.
+    cv.precision = docData.wordVectors.precision;
+    cv.l2NormIndex = docData.wordVectors.l2NormIndex;
+    cv.wordIndex = docData.wordVectors.wordIndex;
+    cv.dimensions = docData.wordVectors.dimensions;
+    cv.unkVector = docData.wordVectors.unkVector;
+    // The following properties are determined on the basis of the context.
+    cv.size = 0;
+    cv.words = [];
+    cv.vectors = Object.create( null );
+    // Shortcut to all the word vectors.
+    const awvs = docData.wordVectors.vectors;
+
+    // Extract all of the document's tokens.
+    const docTokens = colTokens( 0, docData.numOfTokens - 1 )()
+                        .out()
+                        .map( ( t ) => t.toLowerCase() );
+    let docTokensLemma = [];
+    if ( lemma === true ) docTokensLemma = colTokens( 0, docData.numOfTokens - 1 )()
+                                             .out( its.lemma )
+                                             .map( ( t ) => t.toLowerCase() );
+
+    for ( let i = 0; i < docTokens.length; i += 1 ) cv.vectors[ docTokens[ i ] ] = awvs[ docTokens[ i ] ] || cv.unkVector;
+    for ( let i = 0; i < docTokensLemma.length; i += 1 ) cv.vectors[ docTokensLemma[ i ] ] = awvs[ docTokensLemma[ i ] ] || cv.unkVector;
+    for ( let i = 0; i < specificWordVectors.length; i += 1 ) cv.vectors[ specificWordVectors[ i ] ] = awvs[ specificWordVectors[ i ] ] || cv.unkVector;
+
+    if ( similarWordVectors ) {
+      // Extract similar words on the basis of the shortest Manhattan distance.
+      const allUniqueTokens = Object.keys( cv.vectors );
+      // Set up the similar words array, sized to the number of unique tokens.
+      const similarWords = new Array( allUniqueTokens.length );
+      // Placeholder for maintaining the similarity score based on Manhattan distance.
+      const similarWordsScore = new Array( allUniqueTokens.length );
+      // Initialize to a large distance!
+      similarWordsScore.fill( 1000000 );
+
+      // Initialize the contextual vectors' size, i.e. the vocab size.
+      cv.size = allUniqueTokens.length;
+
+      // Now search for each one of them in the entire word vectors space,
+      // updating the smallest distance found so far.
+      for ( let i = 0; i < allUniqueTokens.length; i += 1 ) {
+        const cwv = cv.vectors[ allUniqueTokens[ i ] ];
+
+        for ( const word in awvs ) { // eslint-disable-line guard-for-in
+          if ( word === allUniqueTokens[ i ] ) continue; // eslint-disable-line no-continue
+          const wv = awvs[ word ];
+          let distance = 0;
+
+          for ( let k = 0; k < cv.dimensions && distance < similarWordsScore[ i ]; k += 1 ) {
+            distance += Math.abs( cwv[ k ] - wv[ k ] );
+          } // Manhattan distance computation loop.
+
+          if ( distance < similarWordsScore[ i ] ) {
+            similarWordsScore[ i ] = distance;
+            similarWords[ i ] = word;
+          }
+        } // Traversing all the word vectors.
+      } // Traversing all the tokens in the corpus.
+
+      // Update the contextual vectors using the list of similar words; also update the size.
+      for ( let i = 0; i < similarWords.length; i += 1 ) {
+        if ( cv.vectors[ similarWords[ i ] ] === undefined ) {
+          cv.vectors[ similarWords[ i ] ] = awvs[ similarWords[ i ] ] || cv.unkVector;
+          cv.size += 1;
+        }
+      }
+
+    }
+
+    // Fill the balance space, if any, on the basis of wordVectorsLimit.
+    for ( let i = 0; cv.size < wordVectorsLimit; i += 1 ) {
+      const word = docData.wordVectors.words[ i ];
+      if ( !cv.vectors[ word ] ) {
+        cv.vectors[ word ] = awvs[ word ];
+        cv.size += 1;
+      }
+    }
+
+    // Sort words on the basis of their usage frequency.
+    cv.words = Object.keys( cv.vectors )
+                 .map( ( w ) => ( { w: w, i: ( cv.vectors[ w ][ cv.wordIndex ] < 0 ) ? Infinity : cv.vectors[ w ][ cv.wordIndex ] } ) )
+                 .sort( ( a, b ) => a.i - b.i )
+                 .map( ( o ) => o.w );
+
+    // Update the word index entry inside every vector.
+    for ( let i = 0; i < cv.size; i += 1 ) cv.vectors[ cv.words[ i ] ][ cv.wordIndex ] = i;
+    return JSON.stringify( cv, null, 2 );
+  }; // contextualVectors()

   // Published chainable methods.
   methods.entities = colEntities;
@@ -459,6 +564,8 @@ var doc = function ( docData, addons ) {
   // Ensure that we make a deep copy of config before returning to avoid corruption!
   methods.pipeConfig = () => JSON.parse( JSON.stringify( docData.currPipe ) );

+  methods.contextualVectors = contextualVectors;
+
   return methods;
 };

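For context, a minimal usage sketch of the new API follows. It is a sketch under stated assumptions, not part of this diff: it assumes winkNLP is instantiated with a model whose word vectors are loaded, and the `wink-embeddings-sg-100d` package name and the three-argument `winkNLP( model, pipe, wordEmbeddings )` form are illustrative assumptions. The option names ( `lemma`, `specificWordVectors`, `similarWordVectors`, `wordVectorsLimit` ) come directly from the function signature above.

const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-web-model' );
// Assumed: an embeddings package plus a loader form that populates
// docData.wordVectors; adapt to however your setup loads vectors.
const embeddings = require( 'wink-embeddings-sg-100d' );
const nlp = winkNLP( model, [ 'sbd', 'pos' ], embeddings );

const doc = nlp.readDoc( 'The quick brown fox jumps over the lazy dog.' );
// Build a compact, corpus-specific vectors JSON: include lemmas, pull in
// each unique token's nearest word by Manhattan distance, and pad with
// frequent words until the vocab reaches 100 entries.
const json = doc.contextualVectors( {
  lemma: true,
  similarWordVectors: true,
  wordVectorsLimit: 100
} );
require( 'fs' ).writeFileSync( 'custom-vectors.json', json );

Because the returned JSON follows the winkNLP word-vectors format, it can presumably be loaded in place of the full embeddings file, keeping payloads small while still covering every token seen in the corpus.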