Skip to content

Commit b729d70

Browse files
authored
Merge pull request #1902 from Shaikh-Ubaid/escape_unescape_improvements
Escape unescape improvements
2 parents 549ef01 + 027ffef commit b729d70

22 files changed

+104
-90
lines changed

src/libasr/asdl_cpp.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1662,12 +1662,12 @@ def visitField(self, field, cons):
16621662
elif field.type == "string" and not field.seq:
16631663
if field.opt:
16641664
self.emit("if (x.m_%s) {" % field.name, 2)
1665-
self.emit( 's.append("\\"" + get_escaped_str(x.m_%s) + "\\"");' % field.name, 3)
1665+
self.emit( 's.append("\\"" + str_escape_c(x.m_%s) + "\\"");' % field.name, 3)
16661666
self.emit("} else {", 2)
16671667
self.emit( 's.append("()");', 3)
16681668
self.emit("}", 2)
16691669
else:
1670-
self.emit('s.append("\\"" + get_escaped_str(x.m_%s) + "\\"");' % field.name, 2)
1670+
self.emit('s.append("\\"" + str_escape_c(x.m_%s) + "\\"");' % field.name, 2)
16711671
elif field.type == "int" and not field.seq:
16721672
if field.opt:
16731673
self.emit("if (x.m_%s) {" % field.name, 2)
@@ -1934,12 +1934,12 @@ def visitField(self, field, cons):
19341934
elif field.type == "string" and not field.seq:
19351935
if field.opt:
19361936
self.emit("if (x.m_%s) {" % field.name, 2)
1937-
self.emit( 's.append("\\"" + get_escaped_str(x.m_%s) + "\\"");' % field.name, 3)
1937+
self.emit( 's.append("\\"" + str_escape_c(x.m_%s) + "\\"");' % field.name, 3)
19381938
self.emit("} else {", 2)
19391939
self.emit( 's.append("[]");', 3)
19401940
self.emit("}", 2)
19411941
else:
1942-
self.emit('s.append("\\"" + get_escaped_str(x.m_%s) + "\\"");' % field.name, 2)
1942+
self.emit('s.append("\\"" + str_escape_c(x.m_%s) + "\\"");' % field.name, 2)
19431943
elif field.type == "int" and not field.seq:
19441944
if field.opt:
19451945
self.emit("if (x.m_%s) {" % field.name, 2)

src/libasr/codegen/asr_to_c_cpp.h

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,24 +1036,7 @@ R"(#include <stdio.h>
10361036

10371037

10381038
void visit_StringConstant(const ASR::StringConstant_t &x) {
1039-
src = "\"";
1040-
std::string s = x.m_s;
1041-
for (size_t idx = 0; idx < s.size(); idx++) {
1042-
if (s[idx] == '\n') {
1043-
src += "\\n";
1044-
} else if (s[idx] == '\t') {
1045-
src += "\\t";
1046-
} else if (s[idx] == '\r') {
1047-
src += "\\r";
1048-
}else if (s[idx] == '\\') {
1049-
src += "\\\\";
1050-
} else if (s[idx] == '\"') {
1051-
src += "\\\"";
1052-
} else {
1053-
src += s[idx];
1054-
}
1055-
}
1056-
src += "\"";
1039+
src = "\"" + str_escape_c(x.m_s) + "\"";
10571040
last_expr_precedence = 2;
10581041
}
10591042

src/libasr/codegen/wasm_to_wat.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <fstream>
22

33
#include <libasr/assert.h>
4+
#include <libasr/string_utils.h>
45
#include <libasr/codegen/wasm_decoder.h>
56
#include <libasr/codegen/wasm_to_wat.h>
67

@@ -292,16 +293,15 @@ class WATVisitor : public WASMDecoder<WATVisitor>,
292293
" align=" + std::to_string(1U << mem_align);
293294
}
294295

295-
std::string get_escaped_str(const std::string &s, bool is_iov) {
296+
// Escapes `s` so it can be placed between double quotes in a WAT
// (WebAssembly text format) string literal.
//
// is_iov == true : every byte is written as a two-digit `\hh` hex escape.
// is_iov == false: printable characters are kept as they are; `"` and `\`
//                  are backslash-escaped; newline, carriage return and tab
//                  become `\n`, `\r`, `\t`; every other control byte
//                  (0x00-0x1f and 0x7f) becomes `\hh`. Bytes >= 0x80 are
//                  passed through unchanged.
//
// The non-iov branch deliberately does not reuse the C-style escaper:
// that one produces `\b`, `\f` and `\uXXXX`, which the WAT grammar does
// not accept inside a string.
std::string str_escape_wat(const std::string &s, bool is_iov) {
    static const char hex_digits[] = "0123456789abcdef";
    std::string escaped_str;
    escaped_str.reserve(is_iov ? s.size() * 3 : s.size());
    for (const char ch : s) {
        const unsigned char byte = static_cast<unsigned char>(ch);
        if (!is_iov) {
            // Characters with a dedicated WAT escape.
            switch (ch) {
                case '"':  escaped_str += "\\\""; continue;
                case '\\': escaped_str += "\\\\"; continue;
                case '\n': escaped_str += "\\n";  continue;
                case '\r': escaped_str += "\\r";  continue;
                case '\t': escaped_str += "\\t";  continue;
                default: break;
            }
            // Anything that is not a control byte may appear verbatim.
            if (byte >= 0x20 && byte != 0x7f) {
                escaped_str += ch;
                continue;
            }
        }
        // Generic form: backslash followed by two lowercase hex digits.
        escaped_str += '\\';
        escaped_str += hex_digits[byte >> 4];
        escaped_str += hex_digits[byte & 0x0f];
    }
    return escaped_str;
}
@@ -417,7 +417,7 @@ class WATVisitor : public WASMDecoder<WATVisitor>,
417417
}
418418
result += indent + "(data (;" + std::to_string(i) + ";) (" +
419419
date_segment_insts + ") \"" +
420-
get_escaped_str(data_segments[i].text, (i % 2 == 0)) + "\")";
420+
str_escape_wat(data_segments[i].text, (i % 2 == 0)) + "\")";
421421
}
422422

423423
result += "\n)\n";

src/libasr/containers.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,10 @@ struct Str {
278278
// Returns the stored length `n` of this string.
size_t size() const {
279279
return n;
280280
}
281+
282+
char back() const {
283+
return p[n - 1];
284+
}
281285
};
282286

283287
static_assert(std::is_standard_layout<Str>::value);

src/libasr/string_utils.cpp

Lines changed: 57 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -87,29 +87,6 @@ std::string replace(const std::string &s,
8787
return std::regex_replace(s, std::regex(regex), replace);
8888
}
8989

90-
std::string get_escaped_str(const std::string &s) {
91-
std::ostringstream o;
92-
for (auto c = s.cbegin(); c != s.cend(); c++) {
93-
switch (*c) {
94-
case '"': o << "\\\""; break;
95-
case '\\': o << "\\\\"; break;
96-
case '\b': o << "\\b"; break;
97-
case '\f': o << "\\f"; break;
98-
case '\n': o << "\\n"; break;
99-
case '\r': o << "\\r"; break;
100-
case '\t': o << "\\t"; break;
101-
default:
102-
if ('\x00' <= *c && *c <= '\x1f') {
103-
o << "\\u"
104-
<< std::hex << std::setw(4) << std::setfill('0') << static_cast<int>(*c);
105-
} else {
106-
o << *c;
107-
}
108-
}
109-
}
110-
return o.str();
111-
}
112-
11390
std::string read_file(const std::string &filename)
11491
{
11592
std::ifstream ifs(filename.c_str(), std::ios::in | std::ios::binary
@@ -154,9 +131,33 @@ std::string join_paths(const std::vector<std::string> &paths) {
154131
return p;
155132
}
156133

157-
char* unescape_string(Allocator &al, LCompilers::Str &s) {
158-
std::string x;
159-
for (size_t idx=0; idx < s.size(); idx++) {
134+
// Returns a copy of `s` with special characters backslash-escaped:
//   "  -> \"      \  -> \\
//   backspace, form feed, newline, carriage return, tab -> \b \f \n \r \t
//   any other character in the range 0x00-0x1f -> \u followed by four
//   lowercase, zero-padded hex digits
// Every other character is copied through unchanged.
std::string str_escape_c(const std::string &s) {
    static const char hex_digits[] = "0123456789abcdef";
    std::string escaped;
    for (const char ch : s) {
        // Characters that have a dedicated two-character escape.
        const char *simple = nullptr;
        if (ch == '"') {
            simple = "\\\"";
        } else if (ch == '\\') {
            simple = "\\\\";
        } else if (ch == '\b') {
            simple = "\\b";
        } else if (ch == '\f') {
            simple = "\\f";
        } else if (ch == '\n') {
            simple = "\\n";
        } else if (ch == '\r') {
            simple = "\\r";
        } else if (ch == '\t') {
            simple = "\\t";
        }
        if (simple != nullptr) {
            escaped += simple;
            continue;
        }

        const unsigned char code = static_cast<unsigned char>(ch);
        if (code <= 0x1f) {
            // The value fits in two hex digits, so the upper two of the
            // four-digit field are always zero.
            escaped += "\\u00";
            escaped += hex_digits[code >> 4];
            escaped += hex_digits[code & 0x0f];
        } else {
            escaped += ch;
        }
    }
    return escaped;
}
156+
157+
char* str_unescape_c(Allocator &al, LCompilers::Str &s) {
158+
std::string x = "";
159+
size_t idx = 0;
160+
for (; idx + 1 < s.size(); idx++) {
160161
if (s[idx] == '\\' && s[idx+1] == '\n') { // continuation character
161162
idx++;
162163
} else if (s[idx] == '\\' && s[idx+1] == 'n') {
@@ -187,6 +188,36 @@ char* unescape_string(Allocator &al, LCompilers::Str &s) {
187188
x += s[idx];
188189
}
189190
}
191+
if (idx < s.size()) {
192+
x += s[idx];
193+
}
194+
return LCompilers::s2c(al, x);
195+
}
196+
197+
// Returns a copy of `s` in which every double quote character is doubled
// (`"` -> `""`); all other characters are copied through unchanged.
std::string str_escape_fortran_double_quote(const std::string &s) {
    std::string escaped;
    escaped.reserve(s.size());
    for (const char c : s) {
        escaped += c;
        if (c == '"') {
            // A quote is represented by two consecutive quotes.
            escaped += c;
        }
    }
    return escaped;
}
206+
207+
// Collapses every pair of consecutive `ch` characters in `s` into a single
// `ch` and copies all other characters unchanged. The result is returned as
// the C string produced by LCompilers::s2c(al, ...).
char* str_unescape_fortran(Allocator &al, LCompilers::Str &s, char ch) {
    const size_t len = s.size();
    std::string result;
    size_t pos = 0;
    while (pos < len) {
        const char cur = s[pos];
        result += cur;
        // A doubled delimiter contributes one character: step over its twin.
        const bool doubled = (cur == ch) && (pos + 1 < len) && (s[pos + 1] == ch);
        pos += doubled ? 2 : 1;
    }
    return LCompilers::s2c(al, result);
}
192223

src/libasr/string_utils.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,6 @@ char *s2c(Allocator &al, const std::string &s);
2525
std::string replace(const std::string &s,
2626
const std::string &regex, const std::string &replace);
2727

28-
// Escapes special characters from the given string.
29-
// It is used during AST/R to Json conversion.
30-
std::string get_escaped_str(const std::string &s);
31-
3228
std::string read_file(const std::string &filename);
3329

3430
// Returns the parent path to the given path
@@ -38,7 +34,15 @@ bool is_relative_path(const std::string &path);
3834
// Joins paths (paths can be empty)
3935
std::string join_paths(const std::vector<std::string> &paths);
4036

41-
char* unescape_string(Allocator &al, LCompilers::Str &s);
37+
// Escapes special characters from the given string
38+
// using C style escaping
39+
std::string str_escape_c(const std::string &s);
40+
char* str_unescape_c(Allocator &al, LCompilers::Str &s);
41+
42+
// Escapes double quote characters from the given string
43+
// given string must be enclosed in double quotes
44+
std::string str_escape_fortran_double_quote(const std::string &s);
45+
char* str_unescape_fortran(Allocator &al, LCompilers::Str &s, char ch);
4246

4347
} // namespace LCompilers
4448

src/lpython/parser/semantics.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -798,8 +798,8 @@ static inline ast_t* concat_string(Allocator &al, Location &l,
798798
x.c_str(p.m_a), expr_contextType::Load)
799799
// `x.int_n` is of type BigInt but we store the int64_t directly in AST
800800
#define INTEGER(x, l) make_ConstantInt_t(p.m_a, l, x, nullptr)
801-
#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, unescape_string(p.m_a, x), nullptr)
802-
#define STRING2(x, y, l) concat_string(p.m_a, l, EXPR(x), unescape_string(p.m_a, y), nullptr)
801+
#define STRING1(x, l) make_ConstantStr_t(p.m_a, l, str_unescape_c(p.m_a, x), nullptr)
802+
#define STRING2(x, y, l) concat_string(p.m_a, l, EXPR(x), str_unescape_c(p.m_a, y), nullptr)
803803
#define STRING3(id, x, l) PREFIX_STRING(p.m_a, l, name2char(id), x.c_str(p.m_a))
804804
#define STRING4(x, s, l) concat_string(p.m_a, l, EXPR(x), "", EXPR(s))
805805
#define FLOAT(x, l) make_ConstantFloat_t(p.m_a, l, x, nullptr)
@@ -864,7 +864,7 @@ static inline ast_t *PREFIX_STRING(Allocator &al, Location &l, char *prefix, cha
864864
} else if (strcmp(prefix, "b") == 0) {
865865
LCompilers::Str s_;
866866
s_.from_str(al, std::string(s));
867-
std::string str = std::string(unescape_string(al, s_));
867+
std::string str = std::string(str_unescape_c(al, s_));
868868
str = "b'" + str + "'";
869869
tmp = make_ConstantBytes_t(al, l, LCompilers::s2c(al, str), nullptr);
870870
} else if ( strcmp(prefix, "br") == 0 || strcmp(prefix, "rb") == 0) {

src/lpython/parser/tokenizer.re

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -825,7 +825,7 @@ std::string pickle_token(int token, const YYSTYPE &yystype)
825825
} else if (token == yytokentype::TK_IMAG_NUM) {
826826
t += " " + std::to_string(yystype.f) + "j";
827827
} else if (token == yytokentype::TK_STRING) {
828-
t = t + " " + "\"" + yystype.string.str() + "\"";
828+
t = t + " " + "\"" + str_escape_c(yystype.string.str()) + "\"";
829829
} else if (token == yytokentype::TK_TYPE_COMMENT) {
830830
t = t + " " + "\"" + yystype.string.str() + "\"";
831831
} else if (token == yytokentype::TK_TYPE_IGNORE) {

tests/reference/tokens-docstring1-1355fbb.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"outfile": null,
77
"outfile_hash": null,
88
"stdout": "tokens-docstring1-1355fbb.stdout",
9-
"stdout_hash": "1b85fc7f73cdf02de4658833853717555d29e098188ad737ab1a0ac1",
9+
"stdout_hash": "9afa056946f77dcfa0a5aa89b3ff738274836892169e03c14ee14a8f",
1010
"stderr": null,
1111
"stderr_hash": null,
1212
"returncode": 0

tests/reference/tokens-docstring1-1355fbb.stdout

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
(TOKEN ":") 11:11
66
(NEWLINE) 12:12
77
(TOKEN "indent") 13:16
8-
(TOKEN "string" "A multi-line
9-
docstring.
10-
") 17:54
8+
(TOKEN "string" "A multi-line\n docstring.\n ") 17:54
119
(NEWLINE) 55:55
1210
(NEWLINE) 56:56
1311
(TOKEN "dedent") 56:56
@@ -18,10 +16,7 @@
1816
(TOKEN ":") 68:68
1917
(NEWLINE) 69:69
2018
(TOKEN "indent") 70:73
21-
(TOKEN "string" "
22-
A multi-line
23-
docstring.
24-
") 74:116
19+
(TOKEN "string" "\n A multi-line\n docstring.\n ") 74:116
2520
(NEWLINE) 117:117
2621
(NEWLINE) 118:118
2722
(TOKEN "dedent") 118:118
@@ -36,9 +31,6 @@
3631
(NEWLINE) 167:167
3732
(NEWLINE) 168:168
3833
(TOKEN "dedent") 168:168
39-
(TOKEN "string" "
40-
A multi-line
41-
docstring.
42-
") 169:199
34+
(TOKEN "string" "\nA multi-line\ndocstring.\n") 169:199
4335
(NEWLINE) 200:200
4436
(EOF) 201:201

0 commit comments

Comments
 (0)