Skip to content

Commit a74cda2

Browse files
committed
Added support for UTF-8 during decoding.
The JSON specification does not require it, but some JSON encoders emit UTF-8 encoded text. Added a related autotest case.
1 parent f94820c commit a74cda2

File tree

2 files changed

+139
-46
lines changed

2 files changed

+139
-46
lines changed

library/kernel/json_string.e

Lines changed: 117 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -61,128 +61,190 @@ feature -- Access
6161
item: STRING
6262
-- Contents with escaped entities if any
6363

64+
feature -- Conversion
65+
6466
unescaped_string_8: STRING_8
65-
-- Unescaped string from `item'
67+
-- Unescaped string from `item'.
68+
--| note: valid only if `item' does not encode any unicode character.
69+
local
70+
s: like item
71+
do
72+
s := item
73+
create Result.make (s.count)
74+
unescape_to_string_8 (Result)
75+
end
76+
77+
unescaped_string_32: STRING_32
78+
-- Unescaped string 32 from `item'
79+
--| some encoders use UTF-8, and not the recommended pure JSON encoding
80+
--| thus, let's support the UTF-8 encoding during decoding.
81+
local
82+
s: READABLE_STRING_8
83+
do
84+
s := item
85+
create Result.make (s.count)
86+
unescape_to_string_32 (Result)
87+
end
88+
89+
representation: STRING
90+
-- String representation of `item' with escaped entities if any
91+
do
92+
create Result.make (item.count + 2)
93+
Result.append_character ('%"')
94+
Result.append (item)
95+
Result.append_character ('%"')
96+
end
97+
98+
unescape_to_string_8 (a_output: STRING_8)
99+
-- Unescape string `item' into `a_output'.
100+
--| note: valid only if `item' does not encode any unicode character.
66101
local
67102
s: like item
68103
i, n: INTEGER
69104
c: CHARACTER
70105
do
71106
s := item
72107
n := s.count
73-
create Result.make (n)
74108
from i := 1 until i > n loop
75109
c := s[i]
76110
if c = '\' then
77111
if i < n then
78112
inspect s[i+1]
79113
when '\' then
80-
Result.append_character ('\')
114+
a_output.append_character ('\')
81115
i := i + 2
82116
when '%"' then
83-
Result.append_character ('%"')
117+
a_output.append_character ('%"')
84118
i := i + 2
85119
when 'b' then
86-
Result.append_character ('%B')
120+
a_output.append_character ('%B')
87121
i := i + 2
88122
when 'f' then
89-
Result.append_character ('%F')
123+
a_output.append_character ('%F')
90124
i := i + 2
91125
when 'n' then
92-
Result.append_character ('%N')
126+
a_output.append_character ('%N')
93127
i := i + 2
94128
when 'r' then
95-
Result.append_character ('%R')
129+
a_output.append_character ('%R')
96130
i := i + 2
97131
when 't' then
98-
Result.append_character ('%T')
132+
a_output.append_character ('%T')
99133
i := i + 2
100134
when 'u' then
101135
--| Leave Unicode \uXXXX unescaped
102-
Result.append_character ('\')
136+
a_output.append_character ('\')
103137
i := i + 1
104138
else
105-
Result.append_character ('\')
139+
a_output.append_character ('\')
106140
i := i + 1
107141
end
108142
else
109-
Result.append_character ('\')
143+
a_output.append_character ('\')
110144
i := i + 1
111145
end
112146
else
113-
Result.append_character (c)
147+
a_output.append_character (c)
114148
i := i + 1
115149
end
116150
end
117151
end
118152

119-
unescaped_string_32: STRING_32
120-
-- Unescaped string 32 from `item'
153+
unescape_to_string_32 (a_output: STRING_32)
154+
-- Unescape string `item' into `a_output' string 32.
155+
--| some encoders use UTF-8, and not the recommended pure JSON encoding
156+
--| thus, let's support the UTF-8 encoding during decoding.
121157
local
122-
s: like item
158+
s: READABLE_STRING_8
123159
i, n: INTEGER
124-
c: CHARACTER
125-
hex: STRING
160+
c: NATURAL_32
161+
ch: CHARACTER_8
162+
hex: READABLE_STRING_8
126163
do
127164
s := item
128165
n := s.count
129-
create Result.make (n)
130166
from i := 1 until i > n loop
131-
c := s[i]
132-
if c = '\' then
167+
ch := s.item (i)
168+
if ch = '\' then
133169
if i < n then
134170
inspect s[i+1]
135171
when '\' then
136-
Result.append_character ('\')
172+
a_output.append_character ('\')
137173
i := i + 2
138174
when '%"' then
139-
Result.append_character ('%"')
175+
a_output.append_character ('%"')
140176
i := i + 2
141177
when 'b' then
142-
Result.append_character ('%B')
178+
a_output.append_character ('%B')
143179
i := i + 2
144180
when 'f' then
145-
Result.append_character ('%F')
181+
a_output.append_character ('%F')
146182
i := i + 2
147183
when 'n' then
148-
Result.append_character ('%N')
184+
a_output.append_character ('%N')
149185
i := i + 2
150186
when 'r' then
151-
Result.append_character ('%R')
187+
a_output.append_character ('%R')
152188
i := i + 2
153-
when 'T' then
154-
Result.append_character ('%T')
189+
when 't' then
190+
a_output.append_character ('%T')
155191
i := i + 2
156192
when 'u' then
157-
hex := s.substring (i+2, i+2+4 - 1)
193+
hex := s.substring (i + 2, i + 5) -- i+2 , i+2+4-1
158194
if hex.count = 4 then
159-
Result.append_code (hexadecimal_to_natural_32 (hex))
195+
a_output.append_code (hexadecimal_to_natural_32 (hex))
160196
end
161-
i := i + 2 + 4
197+
i := i + 6 -- i +2 +4
162198
else
163-
Result.append_character ('\')
199+
a_output.append_character ('\')
164200
i := i + 1
165201
end
166202
else
167-
Result.append_character ('\')
203+
a_output.append_character ('\')
168204
i := i + 1
169205
end
170206
else
171-
Result.append_character (c.to_character_32)
207+
c := ch.natural_32_code
208+
if c <= 0x7F then
209+
-- 0xxxxxxx
210+
check ch = c.to_character_32 end
211+
a_output.append_character (ch)
212+
elseif c <= 0xDF then
213+
-- 110xxxxx 10xxxxxx
214+
i := i + 1
215+
if i <= n then
216+
a_output.append_code (
217+
((c & 0x1F) |<< 6) |
218+
(s.code (i) & 0x3F)
219+
)
220+
end
221+
elseif c <= 0xEF then
222+
-- 1110xxxx 10xxxxxx 10xxxxxx
223+
i := i + 2
224+
if i <= n then
225+
a_output.append_code (
226+
((c & 0xF) |<< 12) |
227+
((s.code (i - 1) & 0x3F) |<< 6) |
228+
(s.code (i) & 0x3F)
229+
)
230+
end
231+
elseif c <= 0xF7 then
232+
-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
233+
i := i + 3
234+
if i <= n then
235+
a_output.append_code (
236+
((c & 0x7) |<< 18) |
237+
((s.code (i - 2) & 0x3F) |<< 12) |
238+
((s.code (i - 1) & 0x3F) |<< 6) |
239+
(s.code (i) & 0x3F)
240+
)
241+
end
242+
end
172243
i := i + 1
173244
end
174245
end
175246
end
176247

177-
representation: STRING
178-
-- String representation of `item' with escaped entities if any
179-
do
180-
create Result.make (item.count + 2)
181-
Result.append_character ('%"')
182-
Result.append (item)
183-
Result.append_character ('%"')
184-
end
185-
186248
feature -- Visitor pattern
187249

188250
accept (a_visitor: JSON_VISITOR)
@@ -231,8 +293,18 @@ feature {NONE} -- Implementation
231293

232294
is_hexadecimal (s: READABLE_STRING_8): BOOLEAN
233295
-- Is `s' a hexadecimal value?
296+
local
297+
i: INTEGER
234298
do
235-
Result := across s as scur all scur.item.is_hexa_digit end
299+
from
300+
Result := True
301+
i := 1
302+
until
303+
i > s.count or not Result
304+
loop
305+
Result := s[i].is_hexa_digit
306+
i := i + 1
307+
end
236308
end
237309

238310
hexadecimal_to_natural_32 (s: READABLE_STRING_8): NATURAL_32

test/autotest/test_suite/test_json_suite.e

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
note
1+
note
22
description: "[
33
Eiffel tests that can be executed by testing tool.
44
]"
@@ -62,6 +62,27 @@ feature -- Tests Pass
6262
end
6363
end
6464

65+
test_json_utf_8_pass1
66+
local
67+
parse_json: like new_json_parser
68+
utf: UTF_CONVERTER
69+
s: READABLE_STRING_32
70+
do
71+
s := {STRING_32} "{ %"nihaoma%": %"你好吗\t?%" }"
72+
73+
parse_json := new_json_parser (utf.string_32_to_utf_8_string_8 (s))
74+
json_value := parse_json.parse_json
75+
assert ("utf8.pass1.json", parse_json.is_parsed = True)
76+
if
77+
attached {JSON_OBJECT} json_value as jo and then
78+
attached {JSON_STRING} jo.item ("nihaoma") as js
79+
then
80+
assert ("utf8.nihaoma", js.unescaped_string_32.same_string ({STRING_32} "你好吗%T?"))
81+
else
82+
assert ("utf8.nihaoma", False)
83+
end
84+
end
85+
6586
feature -- Tests Failures
6687
test_json_fail1
6788
--

0 commit comments

Comments
 (0)