Skip to content

Commit 873a1b4

Browse files
authored
Fix test failures with ReinterpretArray (#25)
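`next(x, endof(x))` is not generally valid for an `AbstractVector`, even though it happens to work for `Vector`; `unsafe_checkstring` now indexes `dat[pos]` and advances `pos` explicitly, which fixes a test failure on Julia 0.7. Separately, the `UTF32String(::Vector{Char})` constructor added a `\0` char at the end of the string because it used `reinterpret(UInt32, ...)`, which no longer dispatches directly to the inner constructor as it did on 0.6; the UTF-16 to UTF-32 conversion now builds a `Vector{UInt32}` directly instead.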
1 parent d30435d commit 873a1b4

File tree

2 files changed

+25
-13
lines changed

2 files changed

+25
-13
lines changed

src/support.jl

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,16 @@ function unsafe_checkstring(dat::AbstractVector{UInt8},
8888
flags::UInt = 0
8989
totalchar = num2byte = num3byte = num4byte = 0
9090
@inbounds while pos <= endpos
91-
ch, pos = next(dat, pos)
91+
ch = dat[pos]
92+
pos += 1
9293
totalchar += 1
9394
if ch > 0x7f
9495
# Check UTF-8 encoding
9596
if ch < 0xe0
9697
# 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
9798
(pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
98-
byt, pos = next(dat, pos)
99+
byt = dat[pos]
100+
pos += 1
99101
ch = get_continuation(ch & 0x3f, byt, pos)
100102
if ch > 0x7f
101103
num2byte += 1
@@ -110,20 +112,25 @@ function unsafe_checkstring(dat::AbstractVector{UInt8},
110112
elseif ch < 0xf0
111113
# 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
112114
(pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
113-
byt, pos = next(dat, pos)
115+
byt = dat[pos]
116+
pos += 1
114117
ch = get_continuation(ch & 0x0f, byt, pos)
115-
byt, pos = next(dat, pos)
118+
byt = dat[pos]
119+
pos += 1
116120
ch = get_continuation(ch, byt, pos)
117121
# check for surrogate pairs, make sure correct
118122
if is_surrogate_codeunit(ch)
119123
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
120124
# next character *must* be a trailing surrogate character
121125
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
122-
byt, pos = next(dat, pos)
126+
byt = dat[pos]
127+
pos += 1
123128
(byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
124-
byt, pos = next(dat, pos)
129+
byt = dat[pos]
130+
pos += 1
125131
surr = get_continuation(0x0000d, byt, pos)
126-
byt, pos = next(dat, pos)
132+
byt = dat[pos]
133+
pos += 1
127134
surr = get_continuation(surr, byt, pos)
128135
!is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
129136
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
@@ -140,11 +147,14 @@ function unsafe_checkstring(dat::AbstractVector{UInt8},
140147
elseif ch < 0xf5
141148
# 4-byte UTF-8 sequence (i.e. characters > 0xffff)
142149
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
143-
byt, pos = next(dat, pos)
150+
byt = dat[pos]
151+
pos += 1
144152
ch = get_continuation(ch & 0x07, byt, pos)
145-
byt, pos = next(dat, pos)
153+
byt = dat[pos]
154+
pos += 1
146155
ch = get_continuation(ch, byt, pos)
147-
byt, pos = next(dat, pos)
156+
byt = dat[pos]
157+
pos += 1
148158
ch = get_continuation(ch, byt, pos)
149159
if ch > 0x10ffff
150160
throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
@@ -187,7 +197,8 @@ function unsafe_checkstring(
187197
flags::UInt = 0
188198
totalchar = num2byte = num3byte = num4byte = 0
189199
@inbounds while pos <= endpos
190-
ch, pos = next(dat, pos)
200+
ch = dat[pos]
201+
pos = nextind(dat, pos)
191202
totalchar += 1
192203
if ch > 0x7f
193204
if ch < 0x100
@@ -204,7 +215,8 @@ function unsafe_checkstring(
204215
elseif is_surrogate_lead(ch)
205216
pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
206217
# next character *must* be a trailing surrogate character
207-
ch, pos = next(dat, pos)
218+
ch = dat[pos]
219+
pos = nextind(dat, pos)
208220
!is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
209221
num4byte += 1
210222
if !(typeof(dat) <: AbstractVector{UInt16})

src/utf32.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ function convert(::Type{UTF32String}, str::UTF16String)
8989
# get number of words to create
9090
len, flags, num4byte = unsafe_checkstring(dat, 1, len>>>1)
9191
# No surrogate pairs, do optimized copy
92-
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{Char}(len), dat))
92+
(flags & UTF_UNICODE4) == 0 && @inbounds return UTF32String(copy!(Vector{UInt32}(len), dat))
9393
local ch::UInt32
9494
buf = Vector{UInt32}(len)
9595
out = 0

0 commit comments

Comments
 (0)