Commit 9b8aae7

feat(wink-nlp): add annotations pipe configuration
1 parent cba21c1 commit 9b8aae7

File tree

1 file changed: +96 -54 lines changed


src/wink-nlp.js

Lines changed: 96 additions & 54 deletions
@@ -60,12 +60,13 @@ var tkSize = constants.tkSize;
  * @private
  *
  * @param {object} theModel language model.
+ * @param {string[]} pipe of nlp annotations.
  * @returns {object} containing set of API methods for natural language processing.
  * @example
  * const nlp = require( 'wink-nlp' );
  * var myNLP = nlp();
  */
-var nlp = function ( theModel ) {
+var nlp = function ( theModel, pipe ) {

   var methods = Object.create( null );
   // Token Regex; compiled from `model`
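
For reference, a minimal usage sketch of the new pipe argument, assuming the wink-eng-lite-model package (any installed wink-nlp language model would do):

// Run only sentence boundary detection and PoS tagging on top of tokenization.
const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-model' ); // assumed model package
const nlp = winkNLP( model, [ 'sbd', 'pos' ] );
const doc = nlp.readDoc( 'Hello world! How are you?' );
console.log( doc.sentences().out() ); // [ 'Hello world!', 'How are you?' ]
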
@@ -108,6 +109,18 @@ var nlp = function ( theModel ) {
   // Used to instantiate the compiler.
   var cerMetaModel;

+  // Annotation stuff.
+  var validAnnotations = Object.create( null );
+  validAnnotations.sbd = true;
+  validAnnotations.negation = true;
+  validAnnotations.sentiment = true;
+  validAnnotations.pos = true;
+  validAnnotations.ner = true;
+  validAnnotations.cer = true;
+  // Current pipe.
+  var currPipe = Object.create( null );
+  var onlyTokenization = true;
+
   // Private methods.

   // ## load
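
The six flags above make up the complete annotation set; a small sketch of the equivalent explicit pipe, again assuming the wink-eng-lite-model package:

// Passing every valid annotation name is equivalent to omitting the pipe argument.
const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-model' ); // assumed model package
const allAnnotations = [ 'sbd', 'negation', 'sentiment', 'pos', 'ner', 'cer' ];
const nlpFull = winkNLP( model, allAnnotations ); // same default as winkNLP( model )
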
@@ -229,63 +242,83 @@ var nlp = function ( theModel ) {
   // The structure is `[ start, end, negationFlag, sentimentScore ]`.
   rdd.document = [ 0, ( rdd.numOfTokens - 1 ), 0, 0 ];

-  // Map tokens for automata.
-  var tokens4Automata = mapRawTokens2UIdOfNormal( rdd );
-  // Sentence Boundary Detection.
-  // Set first `Pattern Swap (x)` as `null`.
-  var px = null;
-  for ( let i = 0; i < sbdAutomata.length; i += 1 ) {
-    sbdAutomata[ i ].setPatternSwap( px );
-    // For SBD, all tokens are required to extract preceding spaces.
-    px = sbdAutomata[ i ].recognize( tokens4Automata, sbdTransformers[ i ], rdd.tokens );
+  // Map tokens for automata if there are other annotations to be performed.
+  var tokens4Automata = ( onlyTokenization ) ? null : mapRawTokens2UIdOfNormal( rdd );
+
+  var px;
+  if ( currPipe.sbd ) {
+    // Sentence Boundary Detection.
+    // Set first `Pattern Swap (x)` as `null`.
+    px = null;
+    for ( let i = 0; i < sbdAutomata.length; i += 1 ) {
+      sbdAutomata[ i ].setPatternSwap( px );
+      // For SBD, all tokens are required to extract preceding spaces.
+      px = sbdAutomata[ i ].recognize( tokens4Automata, sbdTransformers[ i ], rdd.tokens );
+    }
+    // The structure of sentence is:<br/>
+    // `[ start, end, negationFlag, sentimentScore ]`
+    sbdSetter( px, rdd );
+    // Compute number of sentences!
+    rdd.numOfSentences = rdd.sentences.length;
+  } else {
+    // Setup default sentence as entire document!
+    rdd.numOfSentences = 1;
+    rdd.sentences = [ [ 0, ( rdd.numOfTokens - 1 ), 0, 0 ] ];
+  }
+
+  if ( currPipe.ner ) {
+    // Named entity detection.
+    px = null;
+    for ( let i = 0; i < nerAutomata.length; i += 1 ) {
+      nerAutomata[ i ].setPatternSwap( px );
+      px = nerAutomata[ i ].recognize( tokens4Automata, nerTransformers[ i ] );
+    }
+    // Entities — stored as array of `[ start, end, entity type ].`
+    // There is no setter for entities as no transformation is needed.
+    rdd.entities = px;
+  } else {
+    rdd.entities = [];
   }
-  // The structure of sentence is:<br/>
-  // `[ start, end, negationFlag, sentimentScore ]`
-  sbdSetter( px, rdd );
-  // Compute number of sentences!
-  rdd.numOfSentences = rdd.sentences.length;
-
-  // Named entity detection.
-  px = null;
-  for ( let i = 0; i < nerAutomata.length; i += 1 ) {
-    nerAutomata[ i ].setPatternSwap( px );
-    px = nerAutomata[ i ].recognize( tokens4Automata, nerTransformers[ i ] );
+
+  if ( currPipe.negation ) {
+    // Negation
+    px = null;
+    px = negAutomata.recognize( tokens4Automata );
+    negSetter( px, rdd, constants, search );
   }
-  // Entities — stored as array of `[ start, end, entity type ].`
-  // There is no setter for entities as no transformation is needed.
-  rdd.entities = px;
-
-  // Negation
-  px = null;
-  px = negAutomata.recognize( tokens4Automata );
-  negSetter( px, rdd, constants, search );
-
-  // Sentiment Analysis
-  px = null;
-  px = saAutomata.recognize( tokens4Automata );
-  saSetter( px, rdd, constants, locate );
-
-  // PoS Tagging
-  const posTags = mapRawTokens2UIdOfDefaultPOS( rdd );
-  px = null;
-  for ( let i = 0; i < posAutomata.length; i += 1 ) {
-    px = posAutomata[ i ].recognize( posTags, posTransformers[ 0 ], rdd.tokens );
-    posUpdater( px, cache, posTags, tokens4Automata );
+
+  if ( currPipe.sentiment ) {
+    // Sentiment Analysis
+    px = null;
+    px = saAutomata.recognize( tokens4Automata );
+    saSetter( px, rdd, constants, locate );
   }
-  posSetter( rdd, posTags, tkSize, constants.bits4lemma );
-
-  // Patterns
-  px = null;
-  if ( cerAutomata !== undefined && cerLearnings > 0 ) {
-    cerConfig.rdd = rdd;
-    cerConfig.preserve = cerPreserve;
-    cerConfig.constants = constants;
-    if ( cerConfig.useEntity ) cerAutomata.setPatternSwap( rdd.entities );
-    px = cerAutomata.recognize( tokens4Automata, cerTransformer, cerConfig );
+
+  if ( currPipe.pos ) {
+    // PoS Tagging
+    const posTags = mapRawTokens2UIdOfDefaultPOS( rdd );
+    px = null;
+    for ( let i = 0; i < posAutomata.length; i += 1 ) {
+      px = posAutomata[ i ].recognize( posTags, posTransformers[ 0 ], rdd.tokens );
+      posUpdater( px, cache, posTags, tokens4Automata );
+    }
+    posSetter( rdd, posTags, tkSize, constants.bits4lemma );
   }
-  // If there are no custom entities, then `px` will be `null`; in such a case
-  // set `customEntities` to an empty array.
-  rdd.customEntities = px || [];
+
+  if ( currPipe.cer ) {
+    // Patterns
+    px = null;
+    if ( cerAutomata !== undefined && cerLearnings > 0 ) {
+      cerConfig.rdd = rdd;
+      cerConfig.preserve = cerPreserve;
+      cerConfig.constants = constants;
+      if ( cerConfig.useEntity ) cerAutomata.setPatternSwap( rdd.entities );
+      px = cerAutomata.recognize( tokens4Automata, cerTransformer, cerConfig );
+    }
+    // If there are no custom entities, then `px` will be `null`; in such a case
+    // set `customEntities` to an empty array.
+    rdd.customEntities = px || [];
+  } else rdd.customEntities = [];


   // Word Vector
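
To illustrate the conditional blocks above, a hedged sketch of the observable effect of excluding an annotation from the pipe, assuming the wink-eng-lite-model package (exact outputs depend on the model):

const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-model' ); // assumed model package
// Request only sentence boundary detection; NER is skipped.
const nlp = winkNLP( model, [ 'sbd' ] );
const doc = nlp.readDoc( 'John visited Paris in June.' );
console.log( doc.sentences().length() ); // 1, since sbd still runs
console.log( doc.entities().out() );     // [], since rdd.entities falls back to an empty array
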
@@ -362,6 +395,15 @@ var nlp = function ( theModel ) {
     throw Error( 'wink-nlp: invalid model used.' );
   }

+  const tempPipe = ( pipe === undefined ) ? Object.keys( validAnnotations ) : pipe;
+  if ( helper.isArray( tempPipe ) ) {
+    tempPipe.forEach( ( at ) => {
+      if ( !validAnnotations[ at ] ) throw Error( `wink-nlp: invalid pipe annotation "${at}" found.` );
+      currPipe[ at ] = true;
+      onlyTokenization = false;
+    } );
+  } else throw Error( `wink-nlp: invalid pipe, it must be an array instead found a "${typeof pipe}".` );
+
   // Load the model.
   load();
   // Setup default configuration.
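
A short sketch of how this validation surfaces to callers; the error messages come from the code above, while the model package name is an assumption:

const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-model' ); // assumed model package

winkNLP( model );                     // ok: undefined pipe enables all annotations
winkNLP( model, [ 'sbd', 'pos' ] );   // ok: subset of valid annotation names
// winkNLP( model, [ 'chunking' ] );  // throws: wink-nlp: invalid pipe annotation "chunking" found.
// winkNLP( model, 'sbd' );           // throws: wink-nlp: invalid pipe, it must be an array instead found a "string".
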

0 commit comments
