1+ ######################## BEGIN LICENSE BLOCK ########################
2+ # The Original Code is mozilla.org code.
3+ #
4+ # The Initial Developer of the Original Code is
5+ # Netscape Communications Corporation.
6+ # Portions created by the Initial Developer are Copyright (C) 1998
7+ # the Initial Developer. All Rights Reserved.
8+ #
9+ # Contributor(s):
10+ # Jeff Hodges - port to Ruby
11+ # Mark Pilgrim - port to Python
12+ #
13+ # This library is free software; you can redistribute it and/or
14+ # modify it under the terms of the GNU Lesser General Public
15+ # License as published by the Free Software Foundation; either
16+ # version 2.1 of the License, or (at your option) any later version.
17+ #
18+ # This library is distributed in the hope that it will be useful,
19+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21+ # Lesser General Public License for more details.
22+ #
23+ # You should have received a copy of the GNU Lesser General Public
24+ # License along with this library; if not, write to the Free Software
25+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26+ # 02110-1301 USA
27+ ######################### END LICENSE BLOCK #########################
28+
module CharDet
  # Minimum approximate number of characters (bytes / 2 for UTF-16,
  # bytes / 4 for UTF-32) that must have been fed before any of the
  # is_likely_* checks may succeed.
  MIN_CHARS_FOR_DETECTION = 20
  # Fraction of the approximate character count whose bytes must match the
  # expected zero / nonzero pattern for an encoding to be considered likely.
  EXPECTED_RATIO = 0.94

  # Prober for BOM-less UTF-16 and UTF-32 text in either byte order.
  #
  # It counts, for each byte position modulo 4, how many bytes were zero and
  # how many were nonzero, and compares those counts against the pattern each
  # encoding would produce. Every complete group of four bytes is also checked
  # for values that are not valid UTF-32 / UTF-16; one invalid value rules the
  # corresponding encoding out until #reset is called.
  class UTF1632Prober < CharSetProber
    def initialize
      super()
      # reset assigns every instance variable this prober uses.
      reset
    end

    # Restore the prober to its initial state so it can be fed a new stream.
    #
    # @return [void]
    def reset
      super()
      @position = 0                      # total number of bytes fed so far
      @zeros_at_mod = [0, 0, 0, 0]       # zero bytes seen at position % 4
      @nonzeros_at_mod = [0, 0, 0, 0]    # nonzero bytes seen at position % 4
      @state = EDetecting
      @quad = [0, 0, 0, 0]               # the current group of four bytes
      @invalid_utf16be = false
      @invalid_utf16le = false
      @invalid_utf32be = false
      @invalid_utf32le = false
      @first_half_surrogate_pair_detected_16be = false
      @first_half_surrogate_pair_detected_16le = false
    end

    # Name of the most likely encoding. UTF-32 is tested before UTF-16.
    #
    # @return [String] "UTF-32BE", "UTF-32LE", "UTF-16BE" or "UTF-16LE";
    #   "UTF-16" when none of the checks succeeds
    def get_charset_name
      return "UTF-32BE" if is_likely_utf32be
      return "UTF-32LE" if is_likely_utf32le
      return "UTF-16BE" if is_likely_utf16be
      return "UTF-16LE" if is_likely_utf16le

      # default to something valid
      "UTF-16"
    end

    # Consume a chunk of input and update the byte statistics.
    #
    # @param aBuf [String] bytes to analyse (iterated with each_byte)
    # @return the prober state after this chunk (see #get_state)
    def feed(aBuf)
      aBuf.each_byte do |b|
        mod4 = @position % 4
        @quad[mod4] = b
        if mod4 == 3
          # A full group of four bytes is available: check it as one UTF-32
          # code unit and as two consecutive UTF-16 code units.
          validate_utf32_characters(@quad)
          validate_utf16_characters(@quad[0, 2])
          validate_utf16_characters(@quad[2, 2])
        end
        if b == 0
          @zeros_at_mod[mod4] += 1
        else
          @nonzeros_at_mod[mod4] += 1
        end
        @position += 1
      end

      get_state
    end

    # Current state; may switch to EFoundIt or ENotMe as a side effect.
    #
    # @return the (possibly updated) value of @state
    def get_state
      # terminal, decided states
      return @state if [ENotMe, EFoundIt].include?(@state)

      if get_confidence > 0.80
        @state = EFoundIt
      elsif @position > 4 * 1024
        # if we get to 4kb into the file, and we can't conclude it's UTF,
        # let's give up
        @state = ENotMe
      end
      @state
    end

    # @return [Float] 0.85 when any of the four encodings looks likely,
    #   0.00 otherwise
    def get_confidence
      if is_likely_utf16le || is_likely_utf16be || is_likely_utf32le || is_likely_utf32be
        0.85
      else
        0.00
      end
    end

    private

    # @return [Float] bytes seen / 4, but never less than 1.0 so it is safe
    #   to divide by
    def approx_32bit_chars
      [1.0, @position / 4.0].max
    end

    # @return [Float] bytes seen / 2, but never less than 1.0 so it is safe
    #   to divide by
    def approx_16bit_chars
      [1.0, @position / 2.0].max
    end

    # UTF-32BE pattern: positions 0, 1 and 2 almost always zero, position 3
    # almost always nonzero.
    #
    # @return [Boolean]
    def is_likely_utf32be
      approx_chars = approx_32bit_chars
      approx_chars >= MIN_CHARS_FOR_DETECTION &&
        @zeros_at_mod[0] / approx_chars > EXPECTED_RATIO &&
        @zeros_at_mod[1] / approx_chars > EXPECTED_RATIO &&
        @zeros_at_mod[2] / approx_chars > EXPECTED_RATIO &&
        @nonzeros_at_mod[3] / approx_chars > EXPECTED_RATIO &&
        !@invalid_utf32be
    end

    # UTF-32LE pattern: position 0 almost always nonzero, positions 1, 2 and 3
    # almost always zero.
    #
    # @return [Boolean]
    def is_likely_utf32le
      approx_chars = approx_32bit_chars
      approx_chars >= MIN_CHARS_FOR_DETECTION &&
        @nonzeros_at_mod[0] / approx_chars > EXPECTED_RATIO &&
        @zeros_at_mod[1] / approx_chars > EXPECTED_RATIO &&
        @zeros_at_mod[2] / approx_chars > EXPECTED_RATIO &&
        @zeros_at_mod[3] / approx_chars > EXPECTED_RATIO &&
        !@invalid_utf32le
    end

    # UTF-16BE pattern: odd positions (1, 3) almost always nonzero, even
    # positions (0, 2) almost always zero.
    #
    # @return [Boolean]
    def is_likely_utf16be
      approx_chars = approx_16bit_chars
      approx_chars >= MIN_CHARS_FOR_DETECTION &&
        (@nonzeros_at_mod[1] + @nonzeros_at_mod[3]) / approx_chars > EXPECTED_RATIO &&
        (@zeros_at_mod[0] + @zeros_at_mod[2]) / approx_chars > EXPECTED_RATIO &&
        !@invalid_utf16be
    end

    # UTF-16LE pattern: even positions (0, 2) almost always nonzero, odd
    # positions (1, 3) almost always zero.
    #
    # @return [Boolean]
    def is_likely_utf16le
      approx_chars = approx_16bit_chars
      approx_chars >= MIN_CHARS_FOR_DETECTION &&
        (@nonzeros_at_mod[0] + @nonzeros_at_mod[2]) / approx_chars > EXPECTED_RATIO &&
        (@zeros_at_mod[1] + @zeros_at_mod[3]) / approx_chars > EXPECTED_RATIO &&
        !@invalid_utf16le
    end

    # Validate if the quad of bytes is valid UTF-32.
    #
    # UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
    # excluding 0x0000D800 - 0x0000DFFF
    #
    # https://en.wikipedia.org/wiki/UTF-32
    #
    # Sets @invalid_utf32be and/or @invalid_utf32le when the quad cannot be a
    # valid code unit in that byte order. The flags are never cleared here.
    #
    # @param [Array<Integer>] quad four consecutive bytes
    # @return [void]
    def validate_utf32_characters(quad)
      # `||` / `&&` with explicit grouping are required: the keyword forms
      # `or` / `and` share a single precedence level and would be evaluated
      # strictly left to right, which is not the intended condition.
      if quad[0] != 0 || quad[1] > 0x10 ||
         (quad[0] == 0 && quad[1] == 0 && (0xD8..0xDF).include?(quad[2]))
        @invalid_utf32be = true
      end
      if quad[3] != 0 || quad[2] > 0x10 ||
         (quad[3] == 0 && quad[2] == 0 && (0xD8..0xDF).include?(quad[1]))
        @invalid_utf32le = true
      end
    end

    # Validate if the pair of bytes is valid UTF-16.
    #
    # UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
    # with an exception for surrogate pairs, which must be in the range
    # 0xD800-0xDBFF followed by 0xDC00-0xDFFF
    #
    # https://en.wikipedia.org/wiki/UTF-16
    #
    # Only the high-order byte of the code unit is inspected: pair[0] for the
    # big-endian reading and pair[1] for the little-endian reading. Whether a
    # leading surrogate is pending is remembered between calls, separately for
    # each byte order.
    #
    # @param [Array<Integer>] pair two consecutive bytes
    # @return [void]
    def validate_utf16_characters(pair)
      if !@first_half_surrogate_pair_detected_16be
        if (0xD8..0xDB).include?(pair[0])
          @first_half_surrogate_pair_detected_16be = true
        elsif (0xDC..0xDF).include?(pair[0])
          # trailing surrogate without a preceding leading surrogate
          @invalid_utf16be = true
        end
      elsif (0xDC..0xDF).include?(pair[0])
        # surrogate pair completed
        @first_half_surrogate_pair_detected_16be = false
      else
        # leading surrogate not followed by a trailing surrogate
        @invalid_utf16be = true
      end

      if !@first_half_surrogate_pair_detected_16le
        if (0xD8..0xDB).include?(pair[1])
          @first_half_surrogate_pair_detected_16le = true
        elsif (0xDC..0xDF).include?(pair[1])
          # trailing surrogate without a preceding leading surrogate
          @invalid_utf16le = true
        end
      elsif (0xDC..0xDF).include?(pair[1])
        # surrogate pair completed
        @first_half_surrogate_pair_detected_16le = false
      else
        # leading surrogate not followed by a trailing surrogate
        @invalid_utf16le = true
      end
    end
  end
end
0 commit comments