Skip to content

Commit 77d7c9d

Browse files
feat(*): enhance its/as helpers for vectorizer
1 parent e9e89ea commit 77d7c9d

File tree

3 files changed

+65
-1
lines changed

3 files changed

+65
-1
lines changed

src/as.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
//
3232

33+
var sort4FT = require( './sort4FT.js' );
3334
var containedMarkings = require( './contained-markings.js' );
3435
var as = Object.create( null );
3536

@@ -88,7 +89,7 @@ as.freqTable = function ( tokens ) {
8889
table[ i ] = [ keys[ i ], bow[ keys[ i ] ] ];
8990
}
9091

91-
return table.sort( ( a, b ) => ( b[ 1 ] - a[ 1 ] ) );
92+
return table.sort( sort4FT );
9293
}; // freqTable()
9394

9495
// ### bigrams

src/its.js

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030

3131
//
3232

33+
var sort4FT = require( './sort4FT.js' );
3334
var constants = require( './constants.js' );
3435
var caseMap = [ 'other', 'lowerCase', 'upperCase', 'titleCase' ];
3536

@@ -138,5 +139,51 @@ its.sentiment = function ( spanItem ) {
138139
return spanItem[ 3 ];
139140
}; // span()
140141

142+
/* ------ utilities ------ */
143+
144+
/**
 * Hands back the `terms` argument untouched; the first two arguments are
 * accepted but never read.
 *
 * @param {*} tf not used by this helper.
 * @param {*} idf not used by this helper.
 * @param {*} terms value returned to the caller as is.
 * @return {*} the `terms` argument, unchanged.
 */
its.terms = function ( tf, idf, terms ) {
  return terms;
}; // terms()
147+
148+
/**
 * Builds a matrix with one row per entry of `tf` and one column per entry of
 * `terms`. Each cell is the value stored under that term in the row's `tf`
 * entry; a missing (or otherwise falsy) value becomes 0.
 *
 * @param {array} tf indexable collection of term lookup objects, one per row.
 * @param {*} idf not used by this helper.
 * @param {array} terms the terms that define the column order.
 * @return {array[]} the matrix as an array of rows.
 */
its.docTermMatrix = function ( tf, idf, terms ) {
  const matrix = new Array( tf.length );
  for ( let doc = 0; doc < tf.length; doc += 1 ) {
    // Build the row locally and store it once it is complete.
    const row = [];
    for ( let col = 0; col < terms.length; col += 1 ) {
      row.push( tf[ doc ][ terms[ col ] ] || 0 );
    }
    matrix[ doc ] = row;
  }
  return matrix;
}; // docTermMatrix()
158+
159+
/**
 * Hands back the `tf` argument untouched.
 *
 * @param {*} tf value returned to the caller as is.
 * @return {*} the `tf` argument, unchanged.
 */
its.docBOWArray = function ( tf ) {
  return tf;
}; // docBOWArray()
162+
163+
/**
 * Hands back the `tf` argument untouched.
 *
 * @param {*} tf value returned to the caller as is.
 * @return {*} the `tf` argument, unchanged.
 */
its.bow = function ( tf ) {
  return tf;
}; // bow()
166+
167+
/**
 * Converts the `idf` lookup into an array of `[ term, value ]` pairs and
 * sorts it with the `sort4FT` comparator (value descending, then term
 * ascending).
 *
 * @param {*} tf not used by this helper.
 * @param {object} idf lookup from term to its idf value.
 * @return {array[]} sorted array of `[ term, value ]` pairs.
 */
its.idf = function ( tf, idf ) {
  const pairs = [];
  // Unguarded on purpose: every enumerable key of `idf` is collected.
  for ( const term in idf ) { // eslint-disable-line guard-for-in
    pairs.push( [ term, idf[ term ] ] );
  }
  // Sort on the idf value followed by the term.
  pairs.sort( sort4FT );
  return pairs;
}; // idf()
175+
176+
/**
 * Converts the `tf` lookup into an array of `[ term, value ]` pairs and
 * sorts it with the `sort4FT` comparator (value descending, then term
 * ascending).
 *
 * @param {object} tf lookup from term to its frequency value.
 * @return {array[]} sorted array of `[ term, value ]` pairs.
 */
its.tf = function ( tf ) {
  const pairs = [];
  // Unguarded on purpose: every enumerable key of `tf` is collected.
  for ( const term in tf ) { // eslint-disable-line guard-for-in
    pairs.push( [ term, tf[ term ] ] );
  }
  // Sort on frequency followed by the term.
  pairs.sort( sort4FT );
  return pairs;
}; // tf()
184+
185+
/**
 * Serializes the two arguments into a JSON string of the form
 * `{"tf": ..., "idf": ...}`.
 *
 * @param {*} tf value stored under the `tf` key.
 * @param {*} idf value stored under the `idf` key.
 * @return {string} JSON text produced by `JSON.stringify()`.
 */
its.modelJSON = function ( tf, idf ) {
  const model = { tf, idf };
  return JSON.stringify( model );
}; // modelJSON()
141188

142189
module.exports = its;

src/sort4FT.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/**
2+
* Stable sort function for frequency table i.e. `[ [ term, frequency ] ... ]`.
3+
* It first sorts on the frequency and then an alpha-numeric sort on term.
4+
*
5+
* @param {array} a first term-frequency pair element sent by sort.
6+
* @param {array} b second term-frequency pair element sent by sort.
7+
* @return {number} number: -1 or 0 or +1.
8+
*/
9+
module.exports = ( a, b ) => {
10+
if ( b[ 1 ] > a[ 1 ] ) {
11+
return 1;
12+
} else if ( b[ 1 ] < a[ 1 ] ) {
13+
return -1;
14+
} else if ( a[ 0 ] > b[ 0 ] ) return 1;
15+
return -1;
16+
};

0 commit comments

Comments
 (0)