@@ -1433,19 +1433,45 @@ struct SplitCode {
14331433 std::string line;
14341434 bool header_read = false ;
14351435 std::vector<std::string> h;
1436+ bool _keep = false ;
1437+ bool _keep_grp = false ;
1438+ bool _remove = false ;
1439+ bool _remove_grp = false ;
14361440 while (std::getline (cfile,line)) {
14371441 if (line.size () == 0 ) {
1442+ _keep = false ;
1443+ _keep_grp = false ;
1444+ _remove = false ;
1445+ _remove_grp = false ;
14381446 continue ;
14391447 }
14401448 if (line[0 ] == ' #' ) {
14411449 continue ;
14421450 }
1443- if (line[0 ] == ' @' ) {
1451+ if (line[0 ] == ' @' || (_keep || _keep_grp || _remove || _remove_grp) ) {
14441452 std::stringstream ss (line);
14451453 std::string field;
14461454 std::string value;
14471455 ss >> field >> value;
1448- if (field == " @qtrim-5" ) {
1456+ if (_keep || _keep_grp || _remove || _remove_grp) { // Read continuous multi-line value (until an empty line)
1457+ std::string sline = field + " " + value;
1458+ while (ss >> value) {
1459+ sline = " " + value;
1460+ }
1461+ value = sline + " \n " ;
1462+ if (_keep) _keep_str += value;
1463+ if (_keep_grp) _keep_grp_str += value;
1464+ if (_remove) _remove_str += value;
1465+ if (_remove_grp) _remove_grp_str += value;
1466+ } else if (field == " @keep:" ) {
1467+ _keep = true ;
1468+ } else if (field == " @keep-grp:" ) {
1469+ _keep_grp = true ;
1470+ } else if (field == " @remove:" ) {
1471+ _remove = true ;
1472+ } else if (field == " @remove-grp:" ) {
1473+ _remove_grp = true ;
1474+ } else if (field == " @qtrim-5" ) {
14491475 this ->quality_trimming_5 = true ;
14501476 } else if (field == " @qtrim-3" ) {
14511477 this ->quality_trimming_3 = true ;
@@ -1643,6 +1669,13 @@ struct SplitCode {
16431669 }
16441670 }
16451671 }
1672+
1673+ // Do some final processing: i.e. if the keep/discard text corpus were provided in the config file, process them now
1674+ if (!_keep_str.empty ()) addFilterList (_keep_str, false , true );
1675+ if (!_remove_str.empty ()) addFilterList (_remove_str, true , true );
1676+ if (!_keep_grp_str.empty ()) addFilterListGroup (_keep_grp_str, false , true );
1677+ if (!_remove_grp_str.empty ()) addFilterListGroup (_remove_grp_str, true , true );
1678+
16461679 checkInit ();
16471680 return true ;
16481681 }
@@ -2096,16 +2129,25 @@ struct SplitCode {
20962129 return true ;
20972130 }
20982131
2099- bool addFilterList (std::string keep_file, bool discard=false ) {
2132+ bool addFilterList (std::string keep_file, bool discard=false , bool is_text_corpus = false ) {
21002133 struct stat stFileInfo;
21012134 auto intstat = stat (keep_file.c_str (), &stFileInfo);
2102- if (intstat != 0 ) {
2103- std::cerr << " Error: file not found " << keep_file << std::endl;
2104- return false ;
2135+ if (!is_text_corpus) {
2136+ if (intstat != 0 ) {
2137+ std::cerr << " Error: file not found " << keep_file << std::endl;
2138+ return false ;
2139+ }
21052140 }
21062141 std::ifstream kfile (keep_file);
2142+ std::istringstream textStream (keep_file);
2143+
2144+ if (is_text_corpus) return processFilterList (textStream, discard, " " );
2145+ else return processFilterList (kfile, discard, keep_file);
2146+ }
2147+
2148+ bool processFilterList (std::istream& input, bool discard, std::string keep_file) {
21072149 std::string line;
2108- while (std::getline (kfile ,line)) {
2150+ while (std::getline (input ,line)) {
21092151 std::string ofile = " " ;
21102152 if (line.size () == 0 ) {
21112153 continue ;
@@ -2129,18 +2171,21 @@ struct SplitCode {
21292171 }
21302172 const auto & itnames = std::find (names.begin (), names.end (), name);
21312173 if (itnames == names.end ()) {
2132- std::cerr << " Error: File " << keep_file << " contains the name \" " << name << " \" which does not exist" << std::endl;
2174+ if (!keep_file.empty ()) std::cerr << " Error: File " << keep_file << " contains the name \" " << name << " \" which does not exist" << std::endl;
2175+ else std::cerr << " Name \" " << name << " \" does not exist" << std::endl;
21332176 return false ;
21342177 }
21352178 u.push_back (itnames - names.begin ());
21362179 }
21372180 auto it1 = idmapinv_keep.find (u);
21382181 auto it2 = idmapinv_discard.find (u);
21392182 if (it1 != idmapinv_keep.end () || it2 != idmapinv_discard.end ()) {
2140- std::cerr << " Error: In file " << keep_file << " , the following line is duplicated: " << line << std::endl;
2183+ if (!keep_file.empty ()) std::cerr << " Error: In file " << keep_file << " , the following line is duplicated: " << line << std::endl;
2184+ else std::cerr << " Error: the following line is duplicated: " << line << std::endl;
21412185 return false ;
21422186 } else if (discard && idmap_find (u) != -1 ) {
2143- std::cerr << " Error: In file " << keep_file << " , the following line cannot be used: " << line << std::endl;
2187+ if (!keep_file.empty ()) std::cerr << " Error: In file " << keep_file << " , the following line cannot be used: " << line << std::endl;
2188+ else std::cerr << " Cannot use the following line: " << line << std::endl;
21442189 return false ;
21452190 }
21462191 if (discard) {
@@ -2154,16 +2199,25 @@ struct SplitCode {
21542199 return true ;
21552200 }
21562201
2157- bool addFilterListGroup (std::string keep_file, bool discard=false ) {
2202+ bool addFilterListGroup (std::string keep_file, bool discard=false , bool is_text_corpus = false ) {
21582203 struct stat stFileInfo;
2159- auto intstat = stat (keep_file.c_str (), &stFileInfo);
2160- if (intstat != 0 ) {
2161- std::cerr << " Error: file not found " << keep_file << std::endl;
2162- return false ;
2204+ if (!is_text_corpus) {
2205+ auto intstat = stat (keep_file.c_str (), &stFileInfo);
2206+ if (intstat != 0 ) {
2207+ std::cerr << " Error: file not found " << keep_file << std::endl;
2208+ return false ;
2209+ }
21632210 }
21642211 std::ifstream kfile (keep_file);
2212+ std::istringstream textStream (keep_file);
2213+
2214+ if (is_text_corpus) return processFilterListGroup (textStream, discard, " " );
2215+ else return processFilterListGroup (kfile, discard, keep_file);
2216+ }
2217+
2218+ bool processFilterListGroup (std::istream& input, bool discard, std::string keep_file) {
21652219 std::string line;
2166- while (std::getline (kfile ,line)) {
2220+ while (std::getline (input ,line)) {
21672221 std::string ofile = " " ;
21682222 if (line.size () == 0 ) {
21692223 continue ;
@@ -2187,15 +2241,17 @@ struct SplitCode {
21872241 }
21882242 const auto & itnames = std::find (group_names.begin (), group_names.end (), name);
21892243 if (itnames == group_names.end ()) {
2190- std::cerr << " Error: File " << keep_file << " contains the group name \" " << name << " \" which does not exist" << std::endl;
2244+ if (!keep_file.empty ()) std::cerr << " Error: File " << keep_file << " contains the group name \" " << name << " \" which does not exist" << std::endl;
2245+ else std::cerr << " Group name \" " << name << " \" does not exist" << std::endl;
21912246 return false ;
21922247 }
21932248 u.push_back (itnames - group_names.begin ());
21942249 }
21952250 auto it1 = groupmapinv_keep.find (u);
21962251 auto it2 = groupmapinv_discard.find (u);
21972252 if (it1 != groupmapinv_keep.end () || it2 != groupmapinv_discard.end ()) {
2198- std::cerr << " Error: In file " << keep_file << " , the following line is duplicated: " << line << std::endl;
2253+ if (!keep_file.empty ()) std::cerr << " Error: In file " << keep_file << " , the following line is duplicated: " << line << std::endl;
2254+ else std::cerr << " The following line is duplicated: " << line << std::endl;
21992255 return false ;
22002256 }
22012257 if (discard) {
@@ -3940,6 +3996,8 @@ struct SplitCode {
39403996
39413997 std::vector<size_t > sub_assign_vec;
39423998
3999+ std::string _keep_str, _keep_grp_str, _remove_str, _remove_grp_str;
4000+
39434001 bool init;
39444002 bool discard_check;
39454003 bool keep_check;
0 commit comments