Skip to content

Commit d2c3a5c

Browse files
authored
Merge pull request #40 from sineme/port-utf1632prober-from-python
ported utf1632prober from chardet/chardet from python to ruby
2 parents 4716e29 + 6fb6a8d commit d2c3a5c

File tree

6 files changed

+265
-0
lines changed

6 files changed

+265
-0
lines changed

lib/rchardet.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
require 'rchardet/sjisprober'
5454
require 'rchardet/universaldetector'
5555
require 'rchardet/utf8prober'
56+
require 'rchardet/utf1632prober'
5657

5758
module CharDet
5859
def CharDet.detect(aBuf)

lib/rchardet/universaldetector.rb

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ def initialize
4242
@highBitDetector = /[\x80-\xFF]/n
4343
@escDetector = /(\033|\~\{)/n
4444
@escCharSetProber = nil
45+
@utf1632prober = nil
4546
@charSetProbers = []
4647
reset()
4748
end
@@ -56,6 +57,9 @@ def reset
5657
if @escCharSetProber
5758
@escCharSetProber.reset()
5859
end
60+
if @utf1632prober
61+
@utf1632prober.reset()
62+
end
5963
for prober in @charSetProbers
6064
prober.reset()
6165
end
@@ -117,6 +121,22 @@ def feed(aBuf)
117121
end
118122

119123
@lastChar = aBuf[-1, 1]
124+
125+
if !@utf1632prober
126+
@utf1632prober = UTF1632Prober.new()
127+
end
128+
129+
if @utf1632prober.get_state == EDetecting
130+
if @utf1632prober.feed(aBuf) == EFoundIt
131+
@result = {
132+
"encoding" => @utf1632prober.get_charset_name(),
133+
"confidence" => @utf1632prober.get_confidence()
134+
}
135+
@done = true
136+
return
137+
end
138+
end
139+
120140
if @inputState == EEscAscii
121141
if !@escCharSetProber
122142
@escCharSetProber = EscCharSetProber.new()

lib/rchardet/utf1632prober.rb

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
######################## BEGIN LICENSE BLOCK ########################
2+
# The Original Code is mozilla.org code.
3+
#
4+
# The Initial Developer of the Original Code is
5+
# Netscape Communications Corporation.
6+
# Portions created by the Initial Developer are Copyright (C) 1998
7+
# the Initial Developer. All Rights Reserved.
8+
#
9+
# Contributor(s):
10+
# Jeff Hodges - port to Ruby
11+
# Mark Pilgrim - port to Python
12+
#
13+
# This library is free software; you can redistribute it and/or
14+
# modify it under the terms of the GNU Lesser General Public
15+
# License as published by the Free Software Foundation; either
16+
# version 2.1 of the License, or (at your option) any later version.
17+
#
18+
# This library is distributed in the hope that it will be useful,
19+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
20+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21+
# Lesser General Public License for more details.
22+
#
23+
# You should have received a copy of the GNU Lesser General Public
24+
# License along with this library; if not, write to the Free Software
25+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26+
# 02110-1301 USA
27+
######################### END LICENSE BLOCK #########################
28+
29+
module CharDet
30+
MIN_CHARS_FOR_DETECTION = 20
31+
EXPECTED_RATIO = 0.94
32+
33+
class UTF1632Prober < CharSetProber
34+
def initialize
  # Explicit empty parens: call the CharSetProber constructor with no arguments.
  super()
  # Count of bytes consumed so far.
  @position = 0
  # Tallies of zero / non-zero bytes, bucketed by byte offset modulo 4.
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @state = EDetecting
  # The most recent bytes, stored at index (offset modulo 4).
  @quad = Array.new(4, 0)
  # Set to true once a code unit that cannot belong to the encoding is seen.
  @invalid_utf16be = false
  @invalid_utf16le = false
  @invalid_utf32be = false
  @invalid_utf32le = false
  # True while a UTF-16 high surrogate is waiting for its low surrogate.
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
  reset
end
49+
50+
def reset
  # Let the CharSetProber base class clear its own state first.
  super()
  @position = 0
  # Fresh arrays each time so earlier tallies are never shared or mutated.
  @zeros_at_mod = Array.new(4, 0)
  @nonzeros_at_mod = Array.new(4, 0)
  @state = EDetecting
  @invalid_utf16be = @invalid_utf16le = false
  @invalid_utf32be = @invalid_utf32le = false
  @first_half_surrogate_pair_detected_16be = false
  @first_half_surrogate_pair_detected_16le = false
  @quad = Array.new(4, 0)
end
64+
65+
def get_charset_name
  # The UTF-32 candidates are tested before the UTF-16 ones; the first
  # predicate that holds decides the reported name.
  return "UTF-32BE" if is_likely_utf32be
  return "UTF-32LE" if is_likely_utf32le
  return "UTF-16BE" if is_likely_utf16be
  return "UTF-16LE" if is_likely_utf16le

  # default to something valid
  "UTF-16"
end
81+
82+
def feed(aBuf)
  # Consume aBuf one byte at a time, updating the zero/non-zero tallies and,
  # after every fourth byte, the UTF-32 / UTF-16 validity flags.
  #
  # aBuf must respond to each_byte. Returns whatever get_state returns after
  # the whole buffer has been consumed.
  aBuf.each_byte do |byte|
    slot = @position % 4
    @quad[slot] = byte
    if slot == 3
      validate_utf32_characters(@quad)
      # Array#[start, length]: hand over exactly the two bytes of each 16-bit
      # unit (the inclusive ranges 0..2 / 2..4 used before took three bytes
      # for the first unit and ran past the end of the array for the second).
      validate_utf16_characters(@quad[0, 2])
      validate_utf16_characters(@quad[2, 2])
    end
    if byte.zero?
      @zeros_at_mod[slot] += 1
    else
      @nonzeros_at_mod[slot] += 1
    end
    @position += 1
  end

  get_state
end
101+
102+
def get_state
  # ENotMe and EFoundIt are terminal: once reached, the state is never
  # recomputed.
  still_open = ![ENotMe, EFoundIt].include?(@state)
  if still_open
    if get_confidence > 0.80
      @state = EFoundIt
    elsif @position > 4 * 1024
      # More than 4 KiB consumed without a conclusion: give up.
      @state = ENotMe
    end
  end
  @state
end
116+
117+
def get_confidence
  # A fixed 0.85 as soon as any of the four candidate encodings looks
  # plausible, otherwise 0.0. Predicates are evaluated in this order and
  # short-circuit on the first hit.
  plausible = is_likely_utf16le || is_likely_utf16be ||
              is_likely_utf32le || is_likely_utf32be
  plausible ? 0.85 : 0.00
end
124+
125+
private
126+
127+
def approx_32bit_chars
  # Bytes consumed divided by four, as a Float, never less than 1.0 so it is
  # safe to use as a divisor.
  units = @position / 4.0
  units < 1.0 ? 1.0 : units
end
130+
131+
def approx_16bit_chars
  # Bytes consumed divided by two, as a Float, never less than 1.0 so it is
  # safe to use as a divisor.
  units = @position / 2.0
  units < 1.0 ? 1.0 : units
end
134+
135+
def is_likely_utf32be
  # True when enough four-byte units have been seen, no invalid UTF-32BE unit
  # was flagged, and more than EXPECTED_RATIO of the units have zero bytes at
  # offsets 0, 1 and 2 and a non-zero byte at offset 3.
  units = approx_32bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION || @invalid_utf32be

  leading_zero = [0, 1, 2].all? { |slot| @zeros_at_mod[slot] / units > EXPECTED_RATIO }
  leading_zero && @nonzeros_at_mod[3] / units > EXPECTED_RATIO
end
145+
146+
def is_likely_utf32le
  # True when enough four-byte units have been seen, no invalid UTF-32LE unit
  # was flagged, and more than EXPECTED_RATIO of the units have a non-zero
  # byte at offset 0 and zero bytes at offsets 1, 2 and 3.
  units = approx_32bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION || @invalid_utf32le

  trailing_zero = [1, 2, 3].all? { |slot| @zeros_at_mod[slot] / units > EXPECTED_RATIO }
  @nonzeros_at_mod[0] / units > EXPECTED_RATIO && trailing_zero
end
155+
156+
def is_likely_utf16be
  # True when enough two-byte units have been seen, no invalid UTF-16BE unit
  # was flagged, and more than EXPECTED_RATIO of the units have a zero byte at
  # an even offset (0 or 2) and a non-zero byte at an odd offset (1 or 3).
  units = approx_16bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION || @invalid_utf16be

  nonzero_ratio = (@nonzeros_at_mod[1] + @nonzeros_at_mod[3]) / units
  zero_ratio = (@zeros_at_mod[0] + @zeros_at_mod[2]) / units
  nonzero_ratio > EXPECTED_RATIO && zero_ratio > EXPECTED_RATIO
end
163+
164+
def is_likely_utf16le
  # True when enough two-byte units have been seen, no invalid UTF-16LE unit
  # was flagged, and more than EXPECTED_RATIO of the units have a non-zero
  # byte at an even offset (0 or 2) and a zero byte at an odd offset (1 or 3).
  units = approx_16bit_chars
  return false if units < MIN_CHARS_FOR_DETECTION || @invalid_utf16le

  nonzero_ratio = (@nonzeros_at_mod[0] + @nonzeros_at_mod[2]) / units
  zero_ratio = (@zeros_at_mod[1] + @zeros_at_mod[3]) / units
  nonzero_ratio > EXPECTED_RATIO && zero_ratio > EXPECTED_RATIO
end
171+
172+
# @param [Array<Integer>] quad four consecutive bytes
173+
# @return [void]
174+
def validate_utf32_characters(quad)
  # Check whether the four bytes can be a UTF-32 code unit, once read as
  # big-endian and once as little-endian, and latch @invalid_utf32be /
  # @invalid_utf32le to true for each reading that is impossible.
  #
  # UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
  # excluding 0x0000D800 - 0x0000DFFF (the surrogate block).
  #
  # https://en.wikipedia.org/wiki/UTF-32
  #
  # Uses ||/&& with explicit grouping: Ruby's `or`/`and` share a single
  # precedence level, so an ungrouped `a or b or c and d` is evaluated as
  # `(a or b or c) and d`, which silently disables the first two checks.
  surrogate_byte = 0xD8..0xDF

  # Big-endian: quad[0] is the most significant byte.
  if quad[0] != 0 || quad[1] > 0x10 ||
     (quad[1] == 0 && surrogate_byte.include?(quad[2]))
    @invalid_utf32be = true
  end

  # Little-endian: quad[3] is the most significant byte.
  if quad[3] != 0 || quad[2] > 0x10 ||
     (quad[2] == 0 && surrogate_byte.include?(quad[1]))
    @invalid_utf32le = true
  end
end
190+
191+
# @param [Array<Integer>] pair two consecutive bytes
192+
# @return [void]
193+
def validate_utf16_characters(pair)
  # Check whether the two bytes can be a UTF-16 code unit, once read as
  # big-endian (pair[0] is the high byte) and once as little-endian
  # (pair[1] is the high byte), tracking surrogate pairs across calls.
  #
  # UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF,
  # with an exception for surrogate pairs, which must be in the range
  # 0xD800-0xDBFF followed by 0xDC00-0xDFFF.
  #
  # https://en.wikipedia.org/wiki/UTF-16
  #
  # Only the high byte of each reading is inspected. A low surrogate with no
  # pending high surrogate, or a pending high surrogate followed by anything
  # else, latches the matching @invalid_utf16* flag to true.
  high_surrogate = 0xD8..0xDB
  low_surrogate = 0xDC..0xDF

  be_high_byte = pair[0]
  if @first_half_surrogate_pair_detected_16be
    if low_surrogate.include?(be_high_byte)
      @first_half_surrogate_pair_detected_16be = false
    else
      @invalid_utf16be = true
    end
  elsif high_surrogate.include?(be_high_byte)
    @first_half_surrogate_pair_detected_16be = true
  elsif low_surrogate.include?(be_high_byte)
    @invalid_utf16be = true
  end

  le_high_byte = pair[1]
  if @first_half_surrogate_pair_detected_16le
    if low_surrogate.include?(le_high_byte)
      @first_half_surrogate_pair_detected_16le = false
    else
      @invalid_utf16le = true
    end
  elsif high_surrogate.include?(le_high_byte)
    @first_half_surrogate_pair_detected_16le = true
  elsif low_surrogate.include?(le_high_byte)
    @invalid_utf16le = true
  end
end
231+
end
232+
end
100 Bytes
Binary file not shown.
106 Bytes
Binary file not shown.

test/simple_test.rb

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,24 @@ def pending
6363
}
6464
end
6565

66+
it "detects UTF_16BE without BOM" do
67+
assert_chardet_spec_detect 'UTF-16BE_without_BOM', {
68+
"encoding" => 'UTF-16BE', "confidence" => 0.85
69+
}
70+
end
71+
6672
it "detects UTF_16LE" do
6773
assert_chardet_spec_detect 'UTF-16LE', {
6874
"encoding" => 'UTF-16LE', "confidence" => 1
6975
}
7076
end
7177

78+
it "detects UTF_16LE without BOM" do
79+
assert_chardet_spec_detect 'UTF-16LE_without_BOM', {
80+
"encoding" => 'UTF-16LE', "confidence" => 0.85
81+
}
82+
end
83+
7284
it "detects ISO_2022_JP" do
7385
assert_chardet_spec_detect 'ISO-2022-JP', {
7486
"encoding" => 'ISO-2022-JP', "confidence" => 0.99

0 commit comments

Comments
 (0)