@@ -30,13 +30,14 @@ const TEL_MIN_LENGTH = 6;
3030const DISTANCE_TOLERANCE = 4 ;
3131const MIN_SCORE = 0.05 ;
3232
33+ // TODO: Remove global result
3334// define result template
3435let result = { } ;
3536
36- // *********************************************
37+ // ****************************************************************************
3738// REGEXES
38- // *********************************************
39- let typos = [
39+ // ****************************************************************************
40+ const typos = [
4041 { regex : / [ A - Z a - z ] 0 [ A - Z a - z ] / g, find : "0" , replace : "o" } , // 0 instead of o inside a text
4142 { regex : / [ A - Z a - z ] \| [ A - Z a - z ] / g, find : "|" , replace : "l" } , // pipe for l
4243 { regex : / [ A - Z a - z ] \| 0 [ A - Z a - z ] / g, find : "|0" , replace : "lo" } , // 0 instead of o + pipe and words
@@ -52,13 +53,13 @@ let typos = [
5253] ;
5354
5455// email
55- let email = / ( ( [ ^ < > ( ) \[ \] \\ . , ; : \s @ " ] + ( \. [ ^ < > ( ) \[ \] \\ . , ; : \s @ " ] + ) * ) | ( " .+ " ) ) @ ( ( \[ [ 0 - 9 ] { 1 , 3 } \. [ 0 - 9 ] { 1 , 3 } \. [ 0 - 9 ] { 1 , 3 } \. [ 0 - 9 ] { 1 , 3 } ] ) | ( ( [ a - z A - Z \- 0 - 9 ] + \. ) + [ a - z A - Z ] { 2 , } ) ) / gi;
56+ const email = / ( ( [ ^ < > ( ) \[ \] \\ . , ; : \s @ " ] + ( \. [ ^ < > ( ) \[ \] \\ . , ; : \s @ " ] + ) * ) | ( " .+ " ) ) @ ( ( \[ [ 0 - 9 ] { 1 , 3 } \. [ 0 - 9 ] { 1 , 3 } \. [ 0 - 9 ] { 1 , 3 } \. [ 0 - 9 ] { 1 , 3 } ] ) | ( ( [ a - z A - Z \- 0 - 9 ] + \. ) + [ a - z A - Z ] { 2 , } ) ) / gi;
5657
5758// web
58- let web = / ( h t t p s ? : \/ \/ (?: w w w \. | (? ! w w w ) ) [ a - z A - Z 0 - 9 ] [ a - z A - Z 0 - 9 - ] + [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } | w w w \. [ a - z A - Z 0 - 9 ] [ a - z A - Z 0 - 9 - ] + [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } | h t t p s ? : \/ \/ (?: w w w \. | (? ! w w w ) ) [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } | w w w \. [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } ) / gi;
59+ const web = / ( h t t p s ? : \/ \/ (?: w w w \. | (? ! w w w ) ) [ a - z A - Z 0 - 9 ] [ a - z A - Z 0 - 9 - ] + [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } | w w w \. [ a - z A - Z 0 - 9 ] [ a - z A - Z 0 - 9 - ] + [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } | h t t p s ? : \/ \/ (?: w w w \. | (? ! w w w ) ) [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } | w w w \. [ a - z A - Z 0 - 9 ] \. [ ^ \s ] { 2 , } ) / gi;
5960
6061// tel
61- let regex_tel = [
62+ const regex_tel = [
6263 { regex : / ( [ + ] [ 0 - 9 ] { 1 , 4 } \s * ) ? ( \( [ 0 - 9 ] { 1 , 2 } \) \s * ) ? ( [ 0 - 9 ] + [ \s | \\ \/ . - ] ? ) { 3 , } / g, confidence : 0.5 } ,
6364 {
6465 regex : / ( ( t e l | p h o n | d i r ) \w * ( [ . | : ] ) * \s * ) ( [ + ] [ 0 - 9 ] { 1 , 4 } \s * ) ? ( \( [ 0 - 9 ] { 1 , 2 } \) \s * ) ? ( [ 0 - 9 ] + [ \s | \\ \/ . - ] ? ) { 3 , } / g,
@@ -67,16 +68,19 @@ let regex_tel = [
6768] ;
6869
6970// fax
70- let regex_fax = [
71+ const regex_fax = [
7172 {
7273 regex : / ( ( f a x ) \w * ( [ . | : ] ) * \s * ) ( [ + ] [ 0 - 9 ] { 1 , 4 } \s * ) ? ( \( [ 0 - 9 ] { 1 , 2 } \) \s * ) ? ( [ 0 - 9 ] + [ \s | \\ \/ . - ] ? ) { 3 , } / g,
7374 confidence : 0.5
7475 }
7576] ;
7677
7778// mobile
78- let regex_mobile = [
79- { regex : / ( [ + ] [ 0 - 9 ] { 1 , 4 } \s * ) ? ( \( [ 0 - 9 ] { 1 , 2 } \) \s * ) ? ( [ 0 - 9 ] + [ \s | \\ \/ . - ] ? ) { 3 , } / g, confidence : 0.5 } ,
79+ const regex_mobile = [
80+ {
81+ regex : / ( [ + ] [ 0 - 9 ] { 1 , 4 } \s * ) ? ( \( [ 0 - 9 ] { 1 , 2 } \) \s * ) ? ( [ 0 - 9 ] + [ \s | \\ \/ . - ] ? ) { 3 , } / g,
82+ confidence : 0.5
83+ } ,
8084 {
8185 regex : / ( ( m o b i | c e l l | h a n d ) \w * ( [ . | : ] ) * \s * ) ( [ + ] [ 0 - 9 ] { 1 , 4 } \s * ) ? ( \( [ 0 - 9 ] { 1 , 2 } \) \s * ) ? ( [ 0 - 9 ] + [ \s | \\ \/ . - ] ? ) { 3 , } / g,
8286 confidence : 0.5
@@ -151,45 +155,56 @@ function analyzePipeline(ocr) {
151155 result = initializeResult ( ) ;
152156
153157 // Step 0: Break lines from tesseract
158+ console . log ( "Analyze pipeline" , "stage" , 0 , "breakLines" ) ;
154159 ocr = breakLines ( ocr ) ;
155160
156161 // Step 1: Clean text from tesseract
162+ console . log ( "Analyze pipeline" , "stage" , 1 , "cleanText" ) ;
157163 ocr = cleanText ( ocr ) ;
158164
159165 // Step 2: Build logical blocks
166+ console . log ( "Analyze pipeline" , "stage" , 2 , "buildBlocks" ) ;
160167 ocr = bcrBuildBlocks ( ocr ) ;
161168
162169 // Step 3: Score email
170+ console . log ( "Analyze pipeline" , "stage" , 3 , "scoreEmail" ) ;
163171 ocr = scoreEmail ( ocr ) ;
164172
165173 // Step 4: Score web
174+ console . log ( "Analyze pipeline" , "stage" , 4 , "scoreWeb" ) ;
166175 ocr = scoreWeb ( ocr ) ;
167176
168177 // Step 5: Score numbers
178+ console . log ( "Analyze pipeline" , "stage" , 5 , "scoreNumbers" ) ;
169179 ocr = scoreNumbers ( ocr ) ;
170180
171181 // Step 6: Score company
182+ console . log ( "Analyze pipeline" , "stage" , 6 , "scoreCompany" ) ;
172183 ocr = scoreCompany ( ocr ) ;
173184
174185 // Step 7: Score name
186+ console . log ( "Analyze pipeline" , "stage" , 7 , "scoreName" ) ;
175187 ocr = scoreName ( ocr ) ;
176188
177189 // Step 8: Score job
190+ console . log ( "Analyze pipeline" , "stage" , 8 , "scoreJob" ) ;
178191 ocr = scoreJob ( ocr ) ;
179192
180193 // Step 9: Score address
194+ console . log ( "Analyze pipeline" , "stage" , 9 , "scoreAddress" ) ;
181195 ocr = scoreAddress ( ocr ) ;
182196
183197 // Step 10: Assign result
198+ console . log ( "Analyze pipeline" , "stage" , 10 , "assignResult" ) ;
184199 assignResults ( ocr ) ;
185200
186201 // return result
187202 return result ;
188203}
189204
190- // *********************************************************************
191- // PREPROCESS
192- // *********************************************************************
205+ // ****************************************************************************
206+ // PREPROCESS OCR Object
207+ // ****************************************************************************
193208
194209// break long line
195210function breakLines ( ocr ) {
@@ -413,9 +428,9 @@ function cleanText(ocr) {
413428 return ocr ;
414429}
415430
416- // *********************************************************************
431+ // ****************************************************************************
417432// UTILITIES
418- // *********************************************************************
433+ // ****************************************************************************
419434
420435// get font distance
421436function getFontBiggerRatio ( average , real ) {
@@ -442,7 +457,6 @@ function bcrGetWordsFont(words) {
442457 return fontSize ;
443458}
444459
445- /*
446460// get average font size of words
447461function bcrGetWordsBold ( words ) {
448462
@@ -454,16 +468,15 @@ function bcrGetWordsBold(words) {
454468 }
455469 return words . length / 2 < fontBold ;
456470}
457- */
458471
/**
 * Match a value against a pattern after lower-casing the value.
 *
 * The value is converted with String() and lower-cased before matching, so
 * a pattern containing upper-case literals can only match if it carries the
 * `i` flag.
 *
 * @param {RegExp|string} re - Pattern handed to String.prototype.match.
 * @param {*} st - Value to test; coerced to a string.
 * @returns {Array} The match result, or an empty array when nothing matches
 *   or when `st` is null/undefined.
 */
function checkRE(re, st) {
  // String(null) / String(undefined) yield the words "null" / "undefined",
  // which letter-based patterns would match by accident. A missing value
  // never counts as a match.
  if (st === null || st === undefined) {
    return [];
  }
  return String(st).toLowerCase().match(re) || [];
}
463476
464- // *********************************************************************
477+ // ****************************************************************************
465478// EXTRACT VALUE FROM BLOCK
466- // *********************************************************************
479+ // ****************************************************************************
467480
468481// extract web from candidate
469482function extractWeb ( text ) {
@@ -569,16 +582,13 @@ function extractZip(text) {
569582
/**
 * Extract a street candidate from a piece of text.
 *
 * The text is lower-cased and tested against every entry of `streetsDS`
 * (defined elsewhere in this file) through `checkRE`.
 *
 * @param {string} text - Text to inspect.
 * @returns {string} The whole lower-cased text when at least one entry
 *   matches, otherwise an empty string.
 */
function extractStreet(text) {
  const lowered = text.toLowerCase();
  // Stop at the first entry that produces a match.
  const hasStreet = streetsDS.some((pattern) => checkRE(pattern, lowered).length > 0);
  return hasStreet ? lowered : "";
}
584594
@@ -621,9 +631,9 @@ function splitName(text) {
621631 return result ;
622632}
623633
624- // *********************************************************************
634+ // ****************************************************************************
625635// SCORES BLOCKS
626- // *********************************************************************
636+ // ****************************************************************************
627637
628638// score email (strategies: regex, @)
629639function scoreEmail ( ocr ) {
@@ -711,7 +721,7 @@ function scoreCompany(ocr) {
711721 website = website . substr ( 0 , website . lastIndexOf ( "." ) ) ;
712722 }
713723 website = website . toLowerCase ( ) ;
714- if ( typeof website !== undefined && website . length > 0 )
724+ if ( typeof website !== " undefined" && website . length > 0 )
715725 keywords [ website ] = website ;
716726 }
717727 }
@@ -723,7 +733,7 @@ function scoreCompany(ocr) {
723733 email = email . substr ( email . indexOf ( "@" ) + 1 ) ;
724734 email = email . substr ( 0 , email . indexOf ( "." ) ) ;
725735 email = email . toLowerCase ( ) ;
726- if ( typeof email !== undefined && email . length > 0 )
736+ if ( typeof email !== " undefined" && email . length > 0 )
727737 keywords [ email ] = email ;
728738 }
729739 }
@@ -734,9 +744,7 @@ function scoreCompany(ocr) {
734744 for ( let i = 0 ; i < ocr . BCR . blocks . length ; i ++ ) {
735745 if ( ocr . BCR . blocks [ i ] . fields . web === 0 && ocr . BCR . blocks [ i ] . fields . email === 0 ) {
736746 let word = ocr . BCR . blocks [ i ] . text . toLowerCase ( ) ;
737- let keys = Object . keys ( keywords ) ;
738- for ( let k in keys ) {
739-
747+ Object . keys ( keywords ) . forEach ( k => {
740748 // calculate similarity
741749 let sim = sSimilarity ( word , k ) ;
742750
@@ -747,7 +755,7 @@ function scoreCompany(ocr) {
747755 }
748756 // remaining 0.2, assigned by font criteria
749757 ocr . BCR . blocks [ i ] . fields . company += getFontBiggerRatio ( ocr . BCR . averageFontSize , ocr . BCR . blocks [ i ] . fontSize ) * 0.2 ;
750- }
758+ } ) ;
751759 }
752760 }
753761
@@ -768,7 +776,7 @@ function scoreName(ocr) {
768776 let nick = email . substr ( 0 , email . indexOf ( "@" ) ) ;
769777 nick = nick . replace ( new RegExp ( "\\." , 'g' ) , " " ) ;
770778
771- if ( typeof email !== undefined && email . length > 0 )
779+ if ( typeof email !== " undefined" && email . length > 0 )
772780 keywords . push ( nick ) ;
773781 }
774782 }
@@ -778,8 +786,7 @@ function scoreName(ocr) {
778786 for ( let i = 0 ; i < ocr . BCR . blocks . length ; i ++ ) {
779787 if ( ocr . BCR . blocks [ i ] . fields . email === 0 ) {
780788 let word = ocr . BCR . blocks [ i ] . text . toLowerCase ( ) ;
781- for ( let k in keywords ) {
782-
789+ for ( let k = 0 ; k < keywords . length ; k ++ ) {
783790 // calculate similarity
784791 let sim = sSimilarity ( word , keywords [ k ] ) ;
785792
@@ -792,10 +799,11 @@ function scoreName(ocr) {
792799 ocr . BCR . blocks [ i ] . fields . name += getFontBiggerRatio ( ocr . BCR . averageFontSize , ocr . BCR . blocks [ i ] . fontSize ) * 0.2 ;
793800
794801 }
802+
795803 }
796804 }
797805
798- // contribute max 0.3, assigned by dataset
806+ // contribute max 0.3, assigned by dataset
799807 for ( let i = 0 ; i < ocr . BCR . blocks . length ; i ++ ) {
800808 if ( ocr . BCR . blocks [ i ] . fields . email === 0 ) {
801809 let line = ocr . BCR . blocks [ i ] . text . toLowerCase ( ) ;
@@ -938,9 +946,9 @@ function scoreAddress(ocr) {
938946 return ocr ;
939947}
940948
941- // *********************************************************************
949+ // ****************************************************************************
942950// Assign results
943- // *********************************************************************
951+ // ****************************************************************************
944952function assignResults ( ocr ) {
945953
946954 let web = [ ] ;
0 commit comments