Skip to content

Commit 9b0e437

Browse files
committed
allow --keep, --keep-grp, --remove, and --remove-grp to be in config file
1 parent f514d2c commit 9b0e437

File tree

1 file changed

+76
-18
lines changed

1 file changed

+76
-18
lines changed

src/SplitCode.h

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1433,19 +1433,45 @@ struct SplitCode {
14331433
std::string line;
14341434
bool header_read = false;
14351435
std::vector<std::string> h;
1436+
bool _keep = false;
1437+
bool _keep_grp = false;
1438+
bool _remove = false;
1439+
bool _remove_grp = false;
14361440
while (std::getline(cfile,line)) {
14371441
if (line.size() == 0) {
1442+
_keep = false;
1443+
_keep_grp = false;
1444+
_remove = false;
1445+
_remove_grp = false;
14381446
continue;
14391447
}
14401448
if (line[0] == '#') {
14411449
continue;
14421450
}
1443-
if (line[0] == '@') {
1451+
if (line[0] == '@' || (_keep || _keep_grp || _remove || _remove_grp)) {
14441452
std::stringstream ss(line);
14451453
std::string field;
14461454
std::string value;
14471455
ss >> field >> value;
1448-
if (field == "@qtrim-5") {
1456+
if (_keep || _keep_grp || _remove || _remove_grp) { // Read continuous multi-line value (until an empty line)
1457+
std::string sline = field + " " + value;
1458+
while (ss >> value) {
1459+
sline = " " + value;
1460+
}
1461+
value = sline + "\n";
1462+
if (_keep) _keep_str += value;
1463+
if (_keep_grp) _keep_grp_str += value;
1464+
if (_remove) _remove_str += value;
1465+
if (_remove_grp) _remove_grp_str += value;
1466+
} else if (field == "@keep:") {
1467+
_keep = true;
1468+
} else if (field == "@keep-grp:") {
1469+
_keep_grp = true;
1470+
} else if (field == "@remove:") {
1471+
_remove = true;
1472+
} else if (field == "@remove-grp:") {
1473+
_remove_grp = true;
1474+
} else if (field == "@qtrim-5") {
14491475
this->quality_trimming_5 = true;
14501476
} else if (field == "@qtrim-3") {
14511477
this->quality_trimming_3 = true;
@@ -1643,6 +1669,13 @@ struct SplitCode {
16431669
}
16441670
}
16451671
}
1672+
1673+
// Final processing: if keep/remove filter lists were provided inline in the config file (as text rather than as file paths), process them now
1674+
if (!_keep_str.empty()) addFilterList(_keep_str, false, true);
1675+
if (!_remove_str.empty()) addFilterList(_remove_str, true, true);
1676+
if (!_keep_grp_str.empty()) addFilterListGroup(_keep_grp_str, false, true);
1677+
if (!_remove_grp_str.empty()) addFilterListGroup(_remove_grp_str, true, true);
1678+
16461679
checkInit();
16471680
return true;
16481681
}
@@ -2096,16 +2129,25 @@ struct SplitCode {
20962129
return true;
20972130
}
20982131

2099-
bool addFilterList(std::string keep_file, bool discard=false) {
2132+
bool addFilterList(std::string keep_file, bool discard=false, bool is_text_corpus = false) {
21002133
struct stat stFileInfo;
21012134
auto intstat = stat(keep_file.c_str(), &stFileInfo);
2102-
if (intstat != 0) {
2103-
std::cerr << "Error: file not found " << keep_file << std::endl;
2104-
return false;
2135+
if (!is_text_corpus) {
2136+
if (intstat != 0) {
2137+
std::cerr << "Error: file not found " << keep_file << std::endl;
2138+
return false;
2139+
}
21052140
}
21062141
std::ifstream kfile(keep_file);
2142+
std::istringstream textStream(keep_file);
2143+
2144+
if (is_text_corpus) return processFilterList(textStream, discard, "");
2145+
else return processFilterList(kfile, discard, keep_file);
2146+
}
2147+
2148+
bool processFilterList(std::istream& input, bool discard, std::string keep_file) {
21072149
std::string line;
2108-
while (std::getline(kfile,line)) {
2150+
while (std::getline(input,line)) {
21092151
std::string ofile = "";
21102152
if (line.size() == 0) {
21112153
continue;
@@ -2129,18 +2171,21 @@ struct SplitCode {
21292171
}
21302172
const auto& itnames = std::find(names.begin(), names.end(), name);
21312173
if (itnames == names.end()) {
2132-
std::cerr << "Error: File " << keep_file << " contains the name \"" << name << "\" which does not exist" << std::endl;
2174+
if (!keep_file.empty()) std::cerr << "Error: File " << keep_file << " contains the name \"" << name << "\" which does not exist" << std::endl;
2175+
else std::cerr << "Name \"" << name << "\" does not exist" << std::endl;
21332176
return false;
21342177
}
21352178
u.push_back(itnames - names.begin());
21362179
}
21372180
auto it1 = idmapinv_keep.find(u);
21382181
auto it2 = idmapinv_discard.find(u);
21392182
if (it1 != idmapinv_keep.end() || it2 != idmapinv_discard.end()) {
2140-
std::cerr << "Error: In file " << keep_file << ", the following line is duplicated: " << line << std::endl;
2183+
if (!keep_file.empty()) std::cerr << "Error: In file " << keep_file << ", the following line is duplicated: " << line << std::endl;
2184+
else std::cerr << "Error: the following line is duplicated: " << line << std::endl;
21412185
return false;
21422186
} else if (discard && idmap_find(u) != -1) {
2143-
std::cerr << "Error: In file " << keep_file << ", the following line cannot be used: " << line << std::endl;
2187+
if (!keep_file.empty()) std::cerr << "Error: In file " << keep_file << ", the following line cannot be used: " << line << std::endl;
2188+
else std::cerr << "Cannot use the following line: " << line << std::endl;
21442189
return false;
21452190
}
21462191
if (discard) {
@@ -2154,16 +2199,25 @@ struct SplitCode {
21542199
return true;
21552200
}
21562201

2157-
bool addFilterListGroup(std::string keep_file, bool discard=false) {
2202+
bool addFilterListGroup(std::string keep_file, bool discard=false, bool is_text_corpus = false) {
21582203
struct stat stFileInfo;
2159-
auto intstat = stat(keep_file.c_str(), &stFileInfo);
2160-
if (intstat != 0) {
2161-
std::cerr << "Error: file not found " << keep_file << std::endl;
2162-
return false;
2204+
if (!is_text_corpus) {
2205+
auto intstat = stat(keep_file.c_str(), &stFileInfo);
2206+
if (intstat != 0) {
2207+
std::cerr << "Error: file not found " << keep_file << std::endl;
2208+
return false;
2209+
}
21632210
}
21642211
std::ifstream kfile(keep_file);
2212+
std::istringstream textStream(keep_file);
2213+
2214+
if (is_text_corpus) return processFilterListGroup(textStream, discard, "");
2215+
else return processFilterListGroup(kfile, discard, keep_file);
2216+
}
2217+
2218+
bool processFilterListGroup(std::istream& input, bool discard, std::string keep_file) {
21652219
std::string line;
2166-
while (std::getline(kfile,line)) {
2220+
while (std::getline(input,line)) {
21672221
std::string ofile = "";
21682222
if (line.size() == 0) {
21692223
continue;
@@ -2187,15 +2241,17 @@ struct SplitCode {
21872241
}
21882242
const auto& itnames = std::find(group_names.begin(), group_names.end(), name);
21892243
if (itnames == group_names.end()) {
2190-
std::cerr << "Error: File " << keep_file << " contains the group name \"" << name << "\" which does not exist" << std::endl;
2244+
if (!keep_file.empty()) std::cerr << "Error: File " << keep_file << " contains the group name \"" << name << "\" which does not exist" << std::endl;
2245+
else std::cerr << "Group name \"" << name << "\" does not exist" << std::endl;
21912246
return false;
21922247
}
21932248
u.push_back(itnames - group_names.begin());
21942249
}
21952250
auto it1 = groupmapinv_keep.find(u);
21962251
auto it2 = groupmapinv_discard.find(u);
21972252
if (it1 != groupmapinv_keep.end() || it2 != groupmapinv_discard.end()) {
2198-
std::cerr << "Error: In file " << keep_file << ", the following line is duplicated: " << line << std::endl;
2253+
if (!keep_file.empty()) std::cerr << "Error: In file " << keep_file << ", the following line is duplicated: " << line << std::endl;
2254+
else std::cerr << "The following line is duplicated: " << line << std::endl;
21992255
return false;
22002256
}
22012257
if (discard) {
@@ -3940,6 +3996,8 @@ struct SplitCode {
39403996

39413997
std::vector<size_t> sub_assign_vec;
39423998

3999+
std::string _keep_str, _keep_grp_str, _remove_str, _remove_grp_str;
4000+
39434001
bool init;
39444002
bool discard_check;
39454003
bool keep_check;

0 commit comments

Comments
 (0)