Skip to content

Commit ac76508

Browse files
committed
Fix reinterpret with wrong alignment
1 parent e8a2176 commit ac76508

File tree

2 files changed

+51
-25
lines changed

2 files changed

+51
-25
lines changed

src/utf16.jl

Lines changed: 23 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -222,20 +222,31 @@ end
222222

223223
function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
224224
isempty(bytes) && return UTF16String(UInt16[0])
225-
isodd(length(bytes)) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
226-
data = reinterpret(UInt16, bytes)
227-
# check for byte-order mark (BOM):
228-
if data[1] == 0xfeff # native byte order
229-
d = Vector{UInt16}(length(data))
230-
copy!(d,1, data,2, length(data)-1)
231-
elseif data[1] == 0xfffe # byte-swapped
232-
d = Vector{UInt16}(length(data))
233-
for i = 2:length(data)
234-
d[i-1] = bswap(data[i])
225+
nb = length(bytes)
226+
isodd(nb) && throw(UnicodeError(UTF_ERR_ODD_BYTES_16, length(bytes), 0))
227+
b1 = bytes[1]
228+
b2 = bytes[2]
229+
if b1 == 0xfe && b2 == 0xff
230+
offset = 1
231+
swap = false
232+
elseif b1 == 0xff && b2 == 0xfe
233+
offset = 1
234+
swap = true
235+
else
236+
offset = 0
237+
swap = false
238+
end
239+
len = nb ÷ 2 - offset
240+
d = Vector{UInt16}(len + 1)
241+
if swap
242+
@inbounds for i in 1:len
243+
ib = i + offset
244+
bhi = bytes[ib * 2 - 1]
245+
blo = bytes[ib * 2]
246+
d[i] = (UInt16(bhi) << 8) | blo
235247
end
236248
else
237-
d = Vector{UInt16}(length(data) + 1)
238-
copy!(d,1, data,1, length(data)) # assume native byte order
249+
unsafe_copy!(Ptr{UInt8}(pointer(d)), pointer(bytes, offset * 2 + 1), len * 2)
239250
end
240251
d[end] = 0 # NULL terminate
241252
!isvalid(UTF16String, d) && throw(UnicodeError(UTF_ERR_INVALID_16,0,0))

src/utf32.jl

Lines changed: 28 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -125,7 +125,7 @@ function convert(::Type{UTF32String}, dat::AbstractVector{UInt32})
125125
end
126126

127127
convert(::Type{UTF32String}, data::AbstractVector{Int32}) =
128-
convert(UTF32String, reinterpret(UInt32, convert(Vector{T}, data)))
128+
convert(UTF32String, reinterpret(UInt32, convert(Vector{Int32}, data)))
129129

130130
convert(::Type{UTF32String}, data::AbstractVector{Char}) =
131131
convert(UTF32String, map(UInt32, data))
@@ -151,20 +151,35 @@ unsafe_convert{T<:Union{UInt32,Int32,Char}}(::Type{Ptr{T}}, s::UTF32String) =
151151

152152
function convert(T::Type{UTF32String}, bytes::AbstractArray{UInt8})
153153
isempty(bytes) && return empty_utf32
154-
length(bytes) & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
155-
data = reinterpret(UInt32, bytes)
156-
# check for byte-order mark (BOM):
157-
if data[1] == 0x0000feff # native byte order
158-
d = Vector{UInt32}(length(data))
159-
copy!(d,1, data, 2, length(data)-1)
160-
elseif data[1] == 0xfffe0000 # byte-swapped
161-
d = Vector{UInt32}(length(data))
162-
for i = 2:length(data)
163-
@inbounds d[i-1] = bswap(data[i])
154+
nb = length(bytes)
155+
nb & 3 != 0 && throw(UnicodeError(UTF_ERR_ODD_BYTES_32,0,0))
156+
b1 = bytes[1]
157+
b2 = bytes[2]
158+
b3 = bytes[3]
159+
b4 = bytes[4]
160+
if b1 == 0 && b2 == 0 && b3 == 0xfe && b4 == 0xff
161+
offset = 1
162+
swap = false
163+
elseif b1 == 0xff && b2 == 0xfe && b3 == 0 && b4 == 0
164+
offset = 1
165+
swap = true
166+
else
167+
offset = 0
168+
swap = false
169+
end
170+
len = nb ÷ 4 - offset
171+
d = Vector{UInt32}(len + 1)
172+
if swap
173+
@inbounds for i in 1:len
174+
ib = i + offset
175+
b1 = UInt32(bytes[ib * 2 - 1])
176+
b2 = UInt32(bytes[ib * 2])
177+
b3 = UInt32(bytes[ib * 2 + 1])
178+
b4 = UInt32(bytes[ib * 2 + 2])
179+
d[i] = (b1 << 24) | (b2 << 16) | (b3 << 8) | b4
164180
end
165181
else
166-
d = Vector{UInt32}(length(data) + 1)
167-
copy!(d, 1, data, 1, length(data)) # assume native byte order
182+
unsafe_copy!(Ptr{UInt8}(pointer(d)), pointer(bytes, offset * 4 + 1), len * 4)
168183
end
169184
d[end] = 0 # NULL terminate
170185
UTF32String(d)

0 commit comments

Comments
 (0)