@@ -360,24 +360,24 @@ void create_mblem_trainfile( const mblem_data& data,
360360 exit ( EXIT_FAILURE );
361361 }
362362 UnicodeString outLine;
363- for ( const auto & data_it : data ){
364- UnicodeString wordform = data_it. first ;
363+ // data is a multimap of Words to a map of lemmas to a frequency list of POS tags.
364+ for ( const auto & [ wordform,lemma_map] : data ){
365365 UnicodeString safeInstance;
366366 if ( !outLine.isEmpty () ){
367367 string out = UnicodeToUTF8 (outLine);
368- out.erase ( out. length ()- 1 ); // remove the final '|'
368+ out.pop_back ( ); // remove the final '|'
369369 os << out << endl;
370370 outLine.remove ();
371371 }
372372 UnicodeString instance;
373373 // format instance
374374 for ( int i=0 ; i<HISTORY; i++) {
375375 int j= wordform.length ()-HISTORY+i;
376- if (j<0 )
376+ if ( j<0 ) {
377377 instance += " = " ;
378+ }
378379 else {
379- UChar uc = wordform[j];
380- instance += uc;
380+ instance += wordform[j];
381381 instance += " " ;
382382 }
383383 }
@@ -396,15 +396,15 @@ void create_mblem_trainfile( const mblem_data& data,
396396 << " to " << instance << endl;
397397 }
398398 string out = UnicodeToUTF8 (outLine);
399- out.erase ( out. length ()- 1 );
399+ out.pop_back ( );
400400 os << out << endl;
401401 safeInstance = instance;
402402 outLine = instance;
403403 }
404404 multimap<size_t , multimap<UnicodeString,UnicodeString>,std::greater<size_t >> rev_sorted;
405- // data is a multimap of Words to a map of lemmas to a frequency list of POS tags .
406- // rev_sorted is a multimap of counts to a multimap of tag/lemmas names.
407- for ( const auto & [lemma, tag_map] : data_it. second ){
405+ // rev_sorted is a multimap of counts to a multimap of tag/ lemmas names .
406+ // highest counts first
407+ for ( const auto & [lemma, tag_map] : lemma_map ){
408408 for ( const auto & [tag,count] : tag_map ){
409409 multimap<UnicodeString,UnicodeString> mm;
410410 mm.insert (make_pair (tag,lemma));
@@ -417,8 +417,8 @@ void create_mblem_trainfile( const mblem_data& data,
417417 cerr << mmap << " (" << count << " )" << endl;
418418 }
419419 }
420- for ( const auto & it2 : rev_sorted ){
421- for ( const auto & [tag,lemma] : it2. second ){
420+ for ( const auto & [dummy,tag_lemma_map] : rev_sorted ){
421+ for ( const auto & [tag,lemma] : tag_lemma_map ){
422422 if ( debug ){
423423 cerr << " LEMMA = " << lemma << endl;
424424 cerr << " tag = " << tag << endl;
@@ -427,14 +427,14 @@ void create_mblem_trainfile( const mblem_data& data,
427427 UnicodeString prefixed;
428428 UnicodeString thisform = wordform;
429429 // find out whether there may be a prefix or infix particle
430- for ( const auto & it : particles ){
430+ for ( const auto & [seek_tag,parts] : particles ){
431431 if ( !prefixed.isEmpty () ){
432432 break ;
433433 }
434434 thisform = wordform;
435- if ( tag.indexOf (it. first ) >= 0 ){
435+ if ( tag.indexOf (seek_tag ) >= 0 ){
436436 // the POS tag matches, so potentially yes
437- for ( const auto & part : it. second ){
437+ for ( const auto & part : parts ){
438438 // loop over potential particles.
439439 int part_pos = thisform.indexOf (part);
440440 if ( part_pos != -1 ){
@@ -463,11 +463,12 @@ void create_mblem_trainfile( const mblem_data& data,
463463 ( edit[ident]==lemma[ident] ) ){
464464 ident++;
465465 }
466- if (ident<5 ) {
466+ if ( ident<5 ) {
467467 // so we want at least 5 characters in common between lemma and our
468468 // edit. Otherwise discard.
469- if ( debug )
469+ if ( debug ){
470470 cerr << " must be a fake!" << endl;
471+ }
471472 prefixed = " " ;
472473 }
473474 else {
@@ -489,8 +490,9 @@ void create_mblem_trainfile( const mblem_data& data,
489490 int ident=0 ;
490491 while ( ident < thisform.length () &&
491492 ident < lemma.length () &&
492- thisform[ident]==lemma[ident] )
493+ thisform[ident]==lemma[ident] ){
493494 ident++;
495+ }
494496 if ( ident < thisform.length () ) {
495497 for ( int i=ident; i< thisform.length (); i++) {
496498 deleted += thisform[i];
@@ -507,19 +509,22 @@ void create_mblem_trainfile( const mblem_data& data,
507509 << " , insert " << inserted
508510 << " , delete " << deleted << endl;
509511 }
510- if ( !prefixed.isEmpty () )
512+ if ( !prefixed.isEmpty () ){
511513 outLine += " +P" + prefixed;
512- if ( !deleted.isEmpty () )
514+ }
515+ if ( !deleted.isEmpty () ){
513516 outLine += " +D" + deleted;
514- if ( !inserted.isEmpty () )
517+ }
518+ if ( !inserted.isEmpty () ){
515519 outLine += " +I" + inserted;
520+ }
516521 outLine += " |" ;
517522 }
518523 }
519524 }
520525 if ( !outLine.isEmpty () ){
521526 string out = UnicodeToUTF8 (outLine);
522- out.erase ( out. length ()- 1 );
527+ out.pop_back ( );
523528 os << out << endl;
524529 outLine.remove ();
525530 }
@@ -557,11 +562,11 @@ void create_lemmatizer( const Configuration& config,
557562
558563void check_data ( Tokenizer::TokenizerClass *tokenizer,
559564 const mblem_data& data ){
560- for ( const auto & word : data ){
561- tokenizer->tokenizeLine ( word. first );
565+ for ( const auto & [ word,dummy] : data ){
566+ tokenizer->tokenizeLine ( word );
562567 vector<Tokenizer::Token> v = tokenizer->popSentence ();
563568 if ( v.size () != 1 ){
564- cerr << " the provided tokenizer doesn't handle '" << word. first
569+ cerr << " the provided tokenizer doesn't handle '" << word
565570 << " ' well (splits it into " << v.size () << " parts.)" << endl;
566571 cerr << " [" ;
567572 for ( const auto & w : v ){
0 commit comments