Skip to content

Commit 9c3d0fa

Browse files
committed
Merge #7892: Add full UTF-8 support to RPC
7982fce doc: Mention full UTF-8 support in release notes (Wladimir J. van der Laan) 6bbb4ef test: test utf-8 for labels in wallet (Wladimir J. van der Laan) a406fcb test: add ensure_ascii setting to AuthServiceProxy (Wladimir J. van der Laan) 60ab9b2 Squashed 'src/univalue/' changes from 2740c4f..f32df99 (Wladimir J. van der Laan)
2 parents 3f89a53 + 7982fce commit 9c3d0fa

File tree

14 files changed

+209
-44
lines changed

14 files changed

+209
-44
lines changed

doc/release-notes.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ RPC low-level changes
4343
32-bit and 64-bit platforms, and the txids were missing in the hashed data. This has been
4444
fixed, but this means that the output will be different than from previous versions.
4545

46+
- Full UTF-8 support in the RPC API. Non-ASCII characters in, for example,
47+
wallet labels have always been malformed because they weren't taken into account
48+
properly in JSON RPC processing. This is no longer the case. This also affects
49+
the GUI debug console.
50+
4651
C++11 and Python 3
4752
-------------------
4853

qa/rpc-tests/test_framework/authproxy.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,11 @@ def EncodeDecimal(o):
6767
class AuthServiceProxy(object):
6868
__id_count = 0
6969

70-
def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None):
70+
# ensure_ascii: when True, non-ASCII characters are escaped as \uXXXX; the flag is passed through to json.dumps
71+
def __init__(self, service_url, service_name=None, timeout=HTTP_TIMEOUT, connection=None, ensure_ascii=True):
7172
self.__service_url = service_url
7273
self._service_name = service_name
74+
self.ensure_ascii = ensure_ascii # can be toggled on the fly by tests
7375
self.__url = urlparse.urlparse(service_url)
7476
if self.__url.port is None:
7577
port = 80
@@ -134,12 +136,12 @@ def __call__(self, *args):
134136
AuthServiceProxy.__id_count += 1
135137

136138
log.debug("-%s-> %s %s"%(AuthServiceProxy.__id_count, self._service_name,
137-
json.dumps(args, default=EncodeDecimal)))
139+
json.dumps(args, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
138140
postdata = json.dumps({'version': '1.1',
139141
'method': self._service_name,
140142
'params': args,
141-
'id': AuthServiceProxy.__id_count}, default=EncodeDecimal)
142-
response = self._request('POST', self.__url.path, postdata)
143+
'id': AuthServiceProxy.__id_count}, default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
144+
response = self._request('POST', self.__url.path, postdata.encode('utf-8'))
143145
if response['error'] is not None:
144146
raise JSONRPCException(response['error'])
145147
elif 'result' not in response:
@@ -149,9 +151,9 @@ def __call__(self, *args):
149151
return response['result']
150152

151153
def _batch(self, rpc_call_list):
152-
postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal)
154+
postdata = json.dumps(list(rpc_call_list), default=EncodeDecimal, ensure_ascii=self.ensure_ascii)
153155
log.debug("--> "+postdata)
154-
return self._request('POST', self.__url.path, postdata)
156+
return self._request('POST', self.__url.path, postdata.encode('utf-8'))
155157

156158
def _get_response(self):
157159
http_response = self.__conn.getresponse()
@@ -167,7 +169,7 @@ def _get_response(self):
167169
responsedata = http_response.read().decode('utf8')
168170
response = json.loads(responsedata, parse_float=decimal.Decimal)
169171
if "error" in response and response["error"] is None:
170-
log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal)))
172+
log.debug("<-%s- %s"%(response["id"], json.dumps(response["result"], default=EncodeDecimal, ensure_ascii=self.ensure_ascii)))
171173
else:
172174
log.debug("<-- "+responsedata)
173175
return response

qa/rpc-tests/wallet.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,20 @@ def run_test (self):
309309
balance_nodes = [self.nodes[i].getbalance() for i in range(3)]
310310
block_count = self.nodes[0].getblockcount()
311311

312+
# Check modes:
313+
# - True: unicode escaped as \u....
314+
# - False: unicode directly as UTF-8
315+
for mode in [True, False]:
316+
self.nodes[0].ensure_ascii = mode
317+
# unicode check: Basic Multilingual Plane, Supplementary Plane respectively
318+
for s in [u'рыба', u'𝅘𝅥𝅯']:
319+
addr = self.nodes[0].getaccountaddress(s)
320+
label = self.nodes[0].getaccount(addr)
321+
assert_equal(label, s)
322+
assert(s in self.nodes[0].listaccounts().keys())
323+
self.nodes[0].ensure_ascii = True # restore to default
324+
325+
# maintenance tests
312326
maintenance = [
313327
'-rescan',
314328
'-reindex',

src/univalue/Makefile.am

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ ACLOCAL_AMFLAGS = -I build-aux/m4
33
.INTERMEDIATE: $(GENBIN)
44

55
include_HEADERS = include/univalue.h
6-
noinst_HEADERS = lib/univalue_escapes.h
6+
noinst_HEADERS = lib/univalue_escapes.h lib/univalue_utffilter.h
77

88
lib_LTLIBRARIES = libunivalue.la
99

@@ -73,6 +73,10 @@ TEST_FILES = \
7373
$(TEST_DATA_DIR)/fail35.json \
7474
$(TEST_DATA_DIR)/fail36.json \
7575
$(TEST_DATA_DIR)/fail37.json \
76+
$(TEST_DATA_DIR)/fail38.json \
77+
$(TEST_DATA_DIR)/fail39.json \
78+
$(TEST_DATA_DIR)/fail40.json \
79+
$(TEST_DATA_DIR)/fail41.json \
7680
$(TEST_DATA_DIR)/fail3.json \
7781
$(TEST_DATA_DIR)/fail4.json \
7882
$(TEST_DATA_DIR)/fail5.json \
@@ -83,6 +87,7 @@ TEST_FILES = \
8387
$(TEST_DATA_DIR)/pass1.json \
8488
$(TEST_DATA_DIR)/pass2.json \
8589
$(TEST_DATA_DIR)/pass3.json \
86-
$(TEST_DATA_DIR)/round1.json
90+
$(TEST_DATA_DIR)/round1.json \
91+
$(TEST_DATA_DIR)/round2.json
8792

8893
EXTRA_DIST=$(TEST_FILES) $(GEN_SRCS)

src/univalue/configure.ac

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
m4_define([libunivalue_major_version], [1])
22
m4_define([libunivalue_minor_version], [1])
3-
m4_define([libunivalue_micro_version], [1])
4-
m4_define([libunivalue_interface_age], [1])
3+
m4_define([libunivalue_micro_version], [2])
4+
m4_define([libunivalue_interface_age], [2])
55
# If you need a modifier for the version number.
66
# Normally empty, but can be used to make "fixup" releases.
77
m4_define([libunivalue_extraversion], [])
@@ -14,7 +14,7 @@ m4_define([libunivalue_age], [m4_eval(libunivalue_binary_age - libunivalue_inter
1414
m4_define([libunivalue_version], [libunivalue_major_version().libunivalue_minor_version().libunivalue_micro_version()libunivalue_extraversion()])
1515

1616

17-
AC_INIT([univalue], [1.0.1],
17+
AC_INIT([univalue], [1.0.2],
1818
[http://github.com/jgarzik/univalue/])
1919

2020
dnl make the compilation flags quiet unless V=1 is used

src/univalue/lib/univalue_read.cpp

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <vector>
77
#include <stdio.h>
88
#include "univalue.h"
9+
#include "univalue_utffilter.h"
910

1011
using namespace std;
1112

@@ -174,41 +175,31 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
174175
raw++; // skip "
175176

176177
string valStr;
178+
JSONUTF8StringFilter writer(valStr);
177179

178180
while (*raw) {
179-
if (*raw < 0x20)
181+
if ((unsigned char)*raw < 0x20)
180182
return JTOK_ERR;
181183

182184
else if (*raw == '\\') {
183185
raw++; // skip backslash
184186

185187
switch (*raw) {
186-
case '"': valStr += "\""; break;
187-
case '\\': valStr += "\\"; break;
188-
case '/': valStr += "/"; break;
189-
case 'b': valStr += "\b"; break;
190-
case 'f': valStr += "\f"; break;
191-
case 'n': valStr += "\n"; break;
192-
case 'r': valStr += "\r"; break;
193-
case 't': valStr += "\t"; break;
188+
case '"': writer.push_back('\"'); break;
189+
case '\\': writer.push_back('\\'); break;
190+
case '/': writer.push_back('/'); break;
191+
case 'b': writer.push_back('\b'); break;
192+
case 'f': writer.push_back('\f'); break;
193+
case 'n': writer.push_back('\n'); break;
194+
case 'r': writer.push_back('\r'); break;
195+
case 't': writer.push_back('\t'); break;
194196

195197
case 'u': {
196198
unsigned int codepoint;
197199
if (hatoui(raw + 1, raw + 1 + 4, codepoint) !=
198200
raw + 1 + 4)
199201
return JTOK_ERR;
200-
201-
if (codepoint <= 0x7f)
202-
valStr.push_back((char)codepoint);
203-
else if (codepoint <= 0x7FF) {
204-
valStr.push_back((char)(0xC0 | (codepoint >> 6)));
205-
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
206-
} else if (codepoint <= 0xFFFF) {
207-
valStr.push_back((char)(0xE0 | (codepoint >> 12)));
208-
valStr.push_back((char)(0x80 | ((codepoint >> 6) & 0x3F)));
209-
valStr.push_back((char)(0x80 | (codepoint & 0x3F)));
210-
}
211-
202+
writer.push_back_u(codepoint);
212203
raw += 4;
213204
break;
214205
}
@@ -226,11 +217,13 @@ enum jtokentype getJsonToken(string& tokenVal, unsigned int& consumed,
226217
}
227218

228219
else {
229-
valStr += *raw;
220+
writer.push_back(*raw);
230221
raw++;
231222
}
232223
}
233224

225+
if (!writer.finalize())
226+
return JTOK_ERR;
234227
tokenVal = valStr;
235228
consumed = (raw - rawStart);
236229
return JTOK_STRING;

src/univalue/lib/univalue_utffilter.h

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright 2016 Wladimir J. van der Laan
2+
// Distributed under the MIT software license, see the accompanying
3+
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4+
#ifndef UNIVALUE_UTFFILTER_H
5+
#define UNIVALUE_UTFFILTER_H
6+
7+
#include <string>
8+
9+
/**
 * Filter that generates and validates UTF-8, as well as collates UTF-16
 * surrogate pairs as specified in RFC4627.
 *
 * Input arrives either as raw bytes (push_back) or as whole code points taken
 * from \uXXXX escapes (push_back_u). Well-formed output is appended to the
 * string handed to the constructor. Errors are sticky: once any problem has
 * been seen finalize() returns false, and the contents of the output string
 * must then be treated as unspecified.
 */
class JSONUTF8StringFilter
{
public:
    // s receives the filtered output. Only a reference is stored, so s must
    // outlive the filter.
    JSONUTF8StringFilter(std::string &s):
        str(s), is_valid(true), codepoint(0), state(0), min_codepoint(0), surpair(0)
    {
    }

    // Write single 8-bit char (may be part of UTF-8 sequence)
    void push_back(unsigned char ch)
    {
        if (state == 0) {
            if (ch < 0x80) { // 7-bit ASCII, direct pass-through
                // A pending \uD8xx escape must be completed by a \uDCxx
                // escape; anything in between leaves the pair broken.
                if (surpair)
                    is_valid = false;
                else
                    str.push_back(static_cast<char>(ch));
            } else if (ch < 0xc0) // Mid-sequence character, invalid in this state
                is_valid = false;
            else if (ch < 0xe0) { // Start of 2-byte sequence
                codepoint = (ch & 0x1fu) << 6;
                state = 6;
                min_codepoint = 0x80;
            } else if (ch < 0xf0) { // Start of 3-byte sequence
                codepoint = (ch & 0x0fu) << 12;
                state = 12;
                min_codepoint = 0x800;
            } else if (ch < 0xf8) { // Start of 4-byte sequence
                codepoint = (ch & 0x07u) << 18;
                state = 18;
                min_codepoint = 0x10000;
            } else // Reserved, invalid
                is_valid = false;
        } else {
            if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
                is_valid = false;
            state -= 6;
            codepoint |= (ch & 0x3fu) << state;
            if (state == 0) {
                // RFC 3629: reject overlong encodings, surrogates encoded
                // directly as UTF-8, and values beyond U+10FFFF.
                if (codepoint < min_codepoint ||
                    (codepoint >= 0xD800 && codepoint < 0xE000) ||
                    codepoint > 0x10FFFF)
                    is_valid = false;
                else
                    push_back_u(codepoint); // Also fails an open surrogate pair
            }
        }
    }

    // Write codepoint directly, possibly collating surrogate pairs.
    // cp is a code point or a single UTF-16 surrogate half.
    void push_back_u(unsigned int cp)
    {
        if (state) // Only accept full codepoints in open state
            is_valid = false;
        if (cp >= 0xD800 && cp < 0xDC00) { // First half of surrogate pair
            if (surpair) // Two subsequent surrogate pair openers - fail
                is_valid = false;
            else
                surpair = cp;
        } else if (cp >= 0xDC00 && cp < 0xE000) { // Second half of surrogate pair
            if (surpair) { // Open surrogate pair, expect second half
                // Compute code point from UTF-16 surrogate pair. This has to
                // be an addition: the shifted high half reaches bit 16 for
                // U+20000 and above, so OR-ing in 0x10000 would lose a carry.
                append_codepoint(0x10000 + ((surpair - 0xD800) << 10) + (cp - 0xDC00));
                surpair = 0;
            } else // Second half doesn't follow a first half - fail
                is_valid = false;
        } else {
            if (surpair) // First half of surrogate pair not followed by second - fail
                is_valid = false;
            else
                append_codepoint(cp);
        }
    }

    // Check that we're in a state where the string can be ended
    // No open sequences, no open surrogate pairs, etc
    // Returns true only if everything written so far was well-formed.
    bool finalize()
    {
        if (state || surpair)
            is_valid = false;
        return is_valid;
    }

private:
    std::string &str;
    bool is_valid;
    // Current UTF-8 decoding state
    unsigned int codepoint;
    int state; // Top bit to be filled in for next UTF-8 byte, or 0
    unsigned int min_codepoint; // Smallest value the open sequence may legally encode

    // Track an open surrogate pair in order to handle the following section
    // of RFC4627:
    //
    //    To escape an extended character that is not in the Basic Multilingual
    //    Plane, the character is represented as a twelve-character sequence,
    //    encoding the UTF-16 surrogate pair.  So, for example, a string
    //    containing only the G clef character (U+1D11E) may be represented as
    //    "\uD834\uDD1E".
    //
    //  Two subsequent \u.... may have to be replaced with one actual codepoint.
    unsigned int surpair; // First half of open UTF-16 surrogate pair, or 0

    // Append the UTF-8 encoding of cp to the output string. Values above
    // U+10FFFF cannot be encoded and mark the input as invalid.
    void append_codepoint(unsigned int cp)
    {
        if (cp <= 0x7f)
            str.push_back(static_cast<char>(cp));
        else if (cp <= 0x7FF) {
            str.push_back(static_cast<char>(0xC0 | (cp >> 6)));
            str.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else if (cp <= 0xFFFF) {
            str.push_back(static_cast<char>(0xE0 | (cp >> 12)));
            str.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else if (cp <= 0x10FFFF) {
            str.push_back(static_cast<char>(0xF0 | (cp >> 18)));
            str.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
            str.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
        } else
            is_valid = false;
    }
};
118+
119+
#endif

src/univalue/lib/univalue_write.cpp

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@
88
#include "univalue.h"
99
#include "univalue_escapes.h"
1010

11-
// TODO: Using UTF8
12-
1311
using namespace std;
1412

1513
static string json_escape(const string& inS)
@@ -23,15 +21,8 @@ static string json_escape(const string& inS)
2321

2422
if (escStr)
2523
outS += escStr;
26-
27-
else if (ch < 0x80)
24+
else
2825
outS += ch;
29-
30-
else { // TODO handle UTF-8 properly
31-
char tmpesc[16];
32-
sprintf(tmpesc, "\\u%04x", ch);
33-
outS += tmpesc;
34-
}
3526
}
3627

3728
return outS;

src/univalue/test/fail38.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["\ud834"]

src/univalue/test/fail39.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
["\udd61"]

0 commit comments

Comments
 (0)