Skip to content

Commit 4bf723c

Browse files
authored
filter input non-ASCII char before process (#6739)
* filter input non-ASCII char before process * revert a output code * replace non ascii with space, remove comment before process
1 parent 94078ea commit 4bf723c

File tree

2 files changed

+105
-35
lines changed

2 files changed

+105
-35
lines changed

source/source_io/read_input.cpp

Lines changed: 95 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ std::string to_dir(const std::string& str)
6969
return str_dir;
7070
}
7171

72-
void read_information(std::ifstream& ifs, std::vector<std::string>& output, const std::string& delimiters)
72+
void read_information(std::stringstream& ifs, std::vector<std::string>& output, const std::string& delimiters)
7373
{
7474
std::string line;
7575
getline(ifs, line);
@@ -88,6 +88,62 @@ void read_information(std::ifstream& ifs, std::vector<std::string>& output, cons
8888

8989
bool ReadInput::check_mode = false;
9090

91+
/**
 * @brief Copy the content of an input file into a string stream, dropping
 *        '#' comments and replacing non-ASCII bytes with spaces.
 *
 * The file is always scanned from its beginning, whatever the current read
 * position of `ifs` is. Each '#' starts a comment that runs to the end of the
 * line; the comment text is discarded and a single '\n' is written in its
 * place (a CRLF pair after a comment collapses to one '\n'). A comment on the
 * last line without a trailing line break produces no output at all. Every
 * byte above 0x7F is written as one space, so multi-byte sequences become
 * several spaces. All other bytes are copied unchanged.
 *
 * On return the error flags of `ifs` are cleared and its read position is
 * restored to where it was on entry (when that position could be determined).
 *
 * @param ifs              file stream to read from; must be open
 * @param out_ascii_stream stream that receives the filtered text (appended)
 * @return true if the file was scanned to its end, false if `ifs` has no open
 *         file or an unrecoverable read error occurred
 */
bool filter_nonascii_and_comment(std::ifstream& ifs,
                                 std::stringstream& out_ascii_stream)
{
    // A stream with no associated file has nothing to filter. Checking only
    // the state flags is not enough: a never-opened stream is still "good".
    if (!ifs.is_open())
    {
        return false;
    }

    // Clear the flags BEFORE asking for the position: tellg() reports -1 on a
    // stream that already hit EOF, which would make the restore below fail.
    ifs.clear();
    const std::streampos old_pos = ifs.tellg();
    ifs.seekg(0, std::ios::beg);

    char c = '\0';
    while (ifs.get(c))
    {
        if (c == '#')
        {
            // Comment: skip to the end of the line but keep the line break.
            bool newline_found = false;
            while (ifs.get(c))
            {
                if (c == '\n' || c == '\r')
                {
                    out_ascii_stream.put('\n');
                    // CRLF: swallow the LF, one '\n' was already written
                    if (c == '\r' && ifs.peek() == '\n')
                    {
                        ifs.get(c);
                    }
                    newline_found = true;
                    break;
                }
            }
            if (!newline_found)
            {
                // reached EOF while skipping the comment
                break;
            }
            continue;
        }

        // char may be signed, so compare through unsigned char
        const unsigned char uc = static_cast<unsigned char>(c);
        out_ascii_stream.put(uc <= 0x7F ? c : ' ');
    }

    // eofbit/failbit are expected at the end of the scan; badbit is not.
    const bool read_ok = !ifs.bad();

    // recover ifstream state and position
    ifs.clear();
    if (old_pos != std::streampos(-1))
    {
        ifs.seekg(old_pos);
    }

    return read_ok;
}
145+
146+
91147
ReadInput::ReadInput(const int& rank)
92148
{
93149
this->rank = rank;
@@ -228,32 +284,38 @@ void ReadInput::read_txt_input(Parameter& param, const std::string& filename)
228284
{
229285
ModuleBase::TITLE("ReadInput", "read_txt_input");
230286

231-
std::ifstream ifs(filename.c_str(), std::ios::in);
287+
std::stringstream ascii_stream;
232288

233-
if (!ifs)
234289
{
235-
std::cout << " Can't find the INPUT file." << std::endl;
236-
ModuleBase::WARNING_QUIT("Input::Init", "Error during readin parameters.", 1);
237-
}
290+
std::ifstream ifs(filename.c_str(), std::ios::in);
238291

239-
ifs.clear();
240-
ifs.seekg(0);
292+
if (!ifs)
293+
{
294+
std::cout << " Can't find the INPUT file." << std::endl;
295+
ModuleBase::WARNING_QUIT("Input::Init", "Error during readin parameters.", 1);
296+
}
241297

242-
std::string word;
243-
int ierr = 0;
298+
ifs.clear();
299+
ifs.seekg(0);
300+
301+
filter_nonascii_and_comment(ifs, ascii_stream);
302+
ifs.clear();
244303

245-
// ifs >> std::setiosflags(ios::uppercase);
246-
ifs.rdstate();
247-
while (ifs.good())
304+
// file close after reading
305+
}
306+
307+
int ierr = 0;
308+
ascii_stream.rdstate();
309+
while (ascii_stream.good())
248310
{
249-
ifs >> word;
250-
ifs.ignore(150, '\n');
311+
std::string word;
312+
ascii_stream >> word;
313+
ascii_stream.ignore(150, '\n');
251314
if (word == "INPUT_PARAMETERS")
252315
{
253316
ierr = 1;
254317
break;
255318
}
256-
ifs.rdstate();
257319
}
258320

259321
if (ierr == 0)
@@ -266,14 +328,13 @@ void ReadInput::read_txt_input(Parameter& param, const std::string& filename)
266328
"Bad parameter, please check the input parameters in file INPUT", 1);
267329
}
268330

269-
ifs.rdstate();
270-
// the `word1` is moved here and is renamed to improve the code-readability
271-
std::string word_; // temporary variable to store the keyword read-in
272-
while (ifs.good())
331+
ascii_stream.rdstate();
332+
while (ascii_stream.good())
273333
{
274-
ifs >> word_;
275-
if (ifs.eof()) { break; }
276-
word = FmtCore::lower(word_); // the lowercase of the keyword
334+
std::string word; // temporary variable to store the keyword read-in
335+
ascii_stream >> word;
336+
if (ascii_stream.eof()) { break; }
337+
word = FmtCore::lower(word); // the lowercase of the keyword
277338
auto it = std::find_if(input_lists.begin(), input_lists.end(),
278339
[&word](const std::pair<std::string, Input_Item>& item) { return item.first == word; });
279340
if (it != this->input_lists.end()) // find the keyword
@@ -286,7 +347,7 @@ void ReadInput::read_txt_input(Parameter& param, const std::string& filename)
286347
ModuleBase::WARNING_QUIT("ReadInput", warningstr);
287348
}
288349
// qianrui delete '/' 2024-07-10, because path has '/' head.
289-
read_information(ifs, p_item->str_values, "#!");
350+
read_information(ascii_stream, p_item->str_values, "#!");
290351
}
291352
else // otherwise, it should be a comment or an unrecognized parameter
292353
{
@@ -299,25 +360,24 @@ void ReadInput::read_txt_input(Parameter& param, const std::string& filename)
299360
// otherwise, it is a comment. However, ...
300361
// but it is not always to be shorter than 150 characters
301362
// we can use ignore to skip the rest of the line
302-
ifs.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
363+
ascii_stream.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
303364
}
304365

305-
ifs.rdstate();
306-
if (ifs.eof())
366+
ascii_stream.rdstate();
367+
if (ascii_stream.eof())
307368
{
308369
break;
309370
}
310-
else if (ifs.bad())
371+
else if (ascii_stream.bad())
311372
{
312-
std::cout << " Bad input parameters. " << std::endl;
313-
exit(1);
373+
ModuleBase::WARNING_QUIT("Input",
374+
" Bad input parameters. ", 1);
314375
}
315-
else if (ifs.fail())
376+
else if (ascii_stream.fail())
316377
{
317-
std::cout << " word = " << word << std::endl;
318-
std::cout << " Fail to read parameters. " << std::endl;
319-
ifs.clear();
320-
exit(1);
378+
ascii_stream.clear();
379+
ModuleBase::WARNING_QUIT("Input",
380+
" fail to read parameters. ", 1);
321381
}
322382
}
323383

source/source_io/read_input.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
#include "source_io/module_parameter/parameter.h"
66

77
#include <string>
8+
#include <fstream>
9+
#include <sstream>
10+
811

912
namespace ModuleIO
1013
{
@@ -152,6 +155,13 @@ std::string to_dir(const std::string& str);
152155
// return a warning string if the string is not found in the vector
153156
std::string nofound_str(std::vector<std::string> init_chgs, const std::string& str);
154157

158+
159+
// copy ifstream into stringstream, replacing non-ASCII characters with spaces and stripping '#' comments
160+
// return true if successful, false otherwise
161+
bool filter_nonascii_and_comment(std::ifstream& ifs,
162+
std::stringstream& out_ascii_stream);
163+
164+
155165
} // namespace ModuleIO
156166

157167
#endif

0 commit comments

Comments
 (0)