Skip to content

Commit 5c9c593

Browse files
committed
refactoring
1 parent 22e8a4c commit 5c9c593

File tree

1 file changed

+30
-25
lines changed

1 file changed

+30
-25
lines changed

src/froggen.cxx

Lines changed: 30 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -360,24 +360,24 @@ void create_mblem_trainfile( const mblem_data& data,
360360
exit( EXIT_FAILURE );
361361
}
362362
UnicodeString outLine;
363-
for ( const auto& data_it : data ){
364-
UnicodeString wordform = data_it.first;
363+
// data is a multimap of Words to a map of lemmas to a frequency list of POS tags.
364+
for ( const auto& [wordform,lemma_map] : data ){
365365
UnicodeString safeInstance;
366366
if ( !outLine.isEmpty() ){
367367
string out = UnicodeToUTF8(outLine);
368-
out.erase( out.length()-1 ); // remove the final '|'
368+
out.pop_back(); // remove the final '|'
369369
os << out << endl;
370370
outLine.remove();
371371
}
372372
UnicodeString instance;
373373
// format instance
374374
for ( int i=0; i<HISTORY; i++) {
375375
int j= wordform.length()-HISTORY+i;
376-
if (j<0)
376+
if ( j<0 ) {
377377
instance += "= ";
378+
}
378379
else {
379-
UChar uc = wordform[j];
380-
instance += uc;
380+
instance += wordform[j];
381381
instance += " ";
382382
}
383383
}
@@ -396,15 +396,15 @@ void create_mblem_trainfile( const mblem_data& data,
396396
<< "to " << instance << endl;
397397
}
398398
string out = UnicodeToUTF8(outLine);
399-
out.erase( out.length()-1 );
399+
out.pop_back();
400400
os << out << endl;
401401
safeInstance = instance;
402402
outLine = instance;
403403
}
404404
multimap<size_t, multimap<UnicodeString,UnicodeString>,std::greater<size_t>> rev_sorted;
405-
// data is a multimap of Words to a map of lemmas to a frequency list of POS tags.
406-
// rev_sorted is a multimap of counts to a multimap of tag/lemma names.
407-
for ( const auto& [lemma, tag_map] : data_it.second ){
405+
// rev_sorted is a multimap of counts to a multimap of tag/lemma names.
406+
// highest counts first
407+
for ( const auto& [lemma, tag_map] : lemma_map ){
408408
for ( const auto& [tag,count] : tag_map ){
409409
multimap<UnicodeString,UnicodeString> mm;
410410
mm.insert(make_pair(tag,lemma));
@@ -417,8 +417,8 @@ void create_mblem_trainfile( const mblem_data& data,
417417
cerr << mmap << " (" << count << " )" << endl;
418418
}
419419
}
420-
for ( const auto& it2 : rev_sorted ){
421-
for( const auto& [tag,lemma] : it2.second ){
420+
for ( const auto& [dummy,tag_lemma_map] : rev_sorted ){
421+
for( const auto& [tag,lemma] : tag_lemma_map ){
422422
if ( debug ){
423423
cerr << "LEMMA = " << lemma << endl;
424424
cerr << "tag = " << tag << endl;
@@ -427,14 +427,14 @@ void create_mblem_trainfile( const mblem_data& data,
427427
UnicodeString prefixed;
428428
UnicodeString thisform = wordform;
429429
// find out whether there may be a prefix or infix particle
430-
for( const auto& it : particles ){
430+
for( const auto& [seek_tag,parts] : particles ){
431431
if ( !prefixed.isEmpty() ){
432432
break;
433433
}
434434
thisform = wordform;
435-
if ( tag.indexOf(it.first) >= 0 ){
435+
if ( tag.indexOf(seek_tag) >= 0 ){
436436
// the POS tag matches, so potentially yes
437-
for ( const auto& part : it.second ){
437+
for ( const auto& part : parts ){
438438
// loop over potential particles.
439439
int part_pos = thisform.indexOf(part);
440440
if ( part_pos != -1 ){
@@ -463,11 +463,12 @@ void create_mblem_trainfile( const mblem_data& data,
463463
( edit[ident]==lemma[ident] ) ){
464464
ident++;
465465
}
466-
if (ident<5) {
466+
if ( ident<5 ) {
467467
// so we want at least 5 characters in common between lemma and our
468468
// edit. Otherwise discard.
469-
if ( debug )
469+
if ( debug ){
470470
cerr << " must be a fake!" << endl;
471+
}
471472
prefixed = "";
472473
}
473474
else {
@@ -489,8 +490,9 @@ void create_mblem_trainfile( const mblem_data& data,
489490
int ident=0;
490491
while ( ident < thisform.length() &&
491492
ident < lemma.length() &&
492-
thisform[ident]==lemma[ident] )
493+
thisform[ident]==lemma[ident] ){
493494
ident++;
495+
}
494496
if ( ident < thisform.length() ) {
495497
for ( int i=ident; i< thisform.length(); i++) {
496498
deleted += thisform[i];
@@ -507,19 +509,22 @@ void create_mblem_trainfile( const mblem_data& data,
507509
<< ", insert " << inserted
508510
<< ", delete " << deleted << endl;
509511
}
510-
if ( !prefixed.isEmpty() )
512+
if ( !prefixed.isEmpty() ){
511513
outLine += "+P" + prefixed;
512-
if ( !deleted.isEmpty() )
514+
}
515+
if ( !deleted.isEmpty() ){
513516
outLine += "+D" + deleted;
514-
if ( !inserted.isEmpty() )
517+
}
518+
if ( !inserted.isEmpty() ){
515519
outLine += "+I" + inserted;
520+
}
516521
outLine += "|";
517522
}
518523
}
519524
}
520525
if ( !outLine.isEmpty() ){
521526
string out = UnicodeToUTF8(outLine);
522-
out.erase( out.length()-1 );
527+
out.pop_back();
523528
os << out << endl;
524529
outLine.remove();
525530
}
@@ -557,11 +562,11 @@ void create_lemmatizer( const Configuration& config,
557562

558563
void check_data( Tokenizer::TokenizerClass *tokenizer,
559564
const mblem_data& data ){
560-
for ( const auto& word : data ){
561-
tokenizer->tokenizeLine( word.first );
565+
for ( const auto& [word,dummy] : data ){
566+
tokenizer->tokenizeLine( word );
562567
vector<Tokenizer::Token> v = tokenizer->popSentence();
563568
if ( v.size() != 1 ){
564-
cerr << "the provided tokenizer doesn't handle '" << word.first
569+
cerr << "the provided tokenizer doesn't handle '" << word
565570
<< "' well (splits it into " << v.size() << " parts.)" << endl;
566571
cerr << "[";
567572
for ( const auto& w : v ){

0 commit comments

Comments
 (0)