Merge pull request #40 from sineme/port-utf1632prober-from-python

grosser · web-flow · commit d2c3a5cb60c7 · 2025-09-09T16:59:27.000-07:00
Ported UTF1632Prober from chardet/chardet (Python) to Ruby
diff --git a/lib/rchardet.rb b/lib/rchardet.rb
@@ -53,6 +53,7 @@
 require 'rchardet/sjisprober'
 require 'rchardet/universaldetector'
 require 'rchardet/utf8prober'
+require 'rchardet/utf1632prober'
 
 module CharDet
   def CharDet.detect(aBuf)
diff --git a/lib/rchardet/universaldetector.rb b/lib/rchardet/universaldetector.rb
@@ -42,6 +42,7 @@ def initialize
       @highBitDetector = /[\x80-\xFF]/n
       @escDetector = /(\033|\~\{)/n
       @escCharSetProber = nil
+      @utf1632prober = nil
       @charSetProbers = []
       reset()
     end
@@ -56,6 +57,9 @@ def reset
       if @escCharSetProber
         @escCharSetProber.reset()
       end
+      if @utf1632prober
+        @utf1632prober.reset()
+      end
       for prober in @charSetProbers
         prober.reset()
       end
@@ -117,6 +121,22 @@ def feed(aBuf)
       end
 
       @lastChar = aBuf[-1, 1]
+
+      if !@utf1632prober
+        @utf1632prober = UTF1632Prober.new()
+      end
+
+      if @utf1632prober.get_state == EDetecting
+        if @utf1632prober.feed(aBuf) == EFoundIt
+          @result = {
+            "encoding" => @utf1632prober.get_charset_name(),
+            "confidence" => @utf1632prober.get_confidence()
+          }
+        @done = true
+        return
+        end
+      end
+
       if @inputState == EEscAscii
         if !@escCharSetProber
           @escCharSetProber = EscCharSetProber.new()
diff --git a/lib/rchardet/utf1632prober.rb b/lib/rchardet/utf1632prober.rb
@@ -0,0 +1,232 @@
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#   Jeff Hodges - port to Ruby
+#   Mark Pilgrim - port to Python
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# 
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+# 
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+
+module CharDet
+  # Minimum approximate character count before any encoding is reported likely.
+  MIN_CHARS_FOR_DETECTION = 20
+  # Ratio (count / characters) the expected zero/non-zero bytes must exceed.
+  EXPECTED_RATIO = 0.94
+
+  # Prober for UTF-16 and UTF-32 text without a byte order mark. It counts
+  # zero and non-zero bytes per position modulo 4 and tracks whether a group
+  # of four bytes is invalid for each of the four candidate encodings.
+  class UTF1632Prober < CharSetProber
+    def initialize
+      super()
+      # reset() assigns every piece of prober state.
+      reset()
+    end
+
+    # Returns the prober to its initial state.
+    def reset
+      super()
+      @position = 0
+      @zeros_at_mod = [0, 0, 0, 0]
+      @nonzeros_at_mod = [0, 0, 0, 0]
+      @state = EDetecting
+      @quad = [0, 0, 0, 0]
+      @invalid_utf16be = false
+      @invalid_utf16le = false
+      @invalid_utf32be = false
+      @invalid_utf32le = false
+      @first_half_surrogate_pair_detected_16be = false
+      @first_half_surrogate_pair_detected_16le = false
+    end
+
+    # @return [String] first likely encoding, tested as UTF-32BE/LE then UTF-16BE/LE
+    def get_charset_name
+      return "UTF-32BE" if is_likely_utf32be
+      return "UTF-32LE" if is_likely_utf32le
+      return "UTF-16BE" if is_likely_utf16be
+      return "UTF-16LE" if is_likely_utf16le
+
+      # default to something valid
+      "UTF-16"
+    end
+
+    # Consumes the bytes of aBuf (anything responding to each_byte), updates
+    # the statistics and returns the resulting state, see get_state.
+    def feed(aBuf)
+      aBuf.each_byte do |b|
+        mod4 = @position % 4
+        @quad[mod4] = b
+        if mod4 == 3
+          # A full quad is one UTF-32 unit or two consecutive UTF-16 units.
+          validate_utf32_characters(@quad)
+          validate_utf16_characters(@quad[0, 2])
+          validate_utf16_characters(@quad[2, 2])
+        end
+        if b == 0
+          @zeros_at_mod[mod4] += 1
+        else
+          @nonzeros_at_mod[mod4] += 1
+        end
+        @position += 1
+      end
+
+      get_state()
+    end
+
+    # Returns the detection state; it becomes EFoundIt once the confidence
+    # exceeds 0.80 and ENotMe once more than 4 KiB were fed without a match.
+    def get_state
+      # terminal, decided states
+      return @state if [ENotMe, EFoundIt].include?(@state)
+
+      if get_confidence > 0.80
+        @state = EFoundIt
+      elsif @position > 4 * 1024
+        # if we get to 4kb into the file, and we can't conclude it's UTF,
+        # let's give up
+        @state = ENotMe
+      end
+      @state
+    end
+
+    # @return [Float] 0.85 when any of the four encodings is likely, else 0.0
+    def get_confidence
+      if is_likely_utf16le || is_likely_utf16be || is_likely_utf32le || is_likely_utf32be
+        0.85
+      else
+        0.00
+      end
+    end
+
+    private
+
+    # @return [Float] number of 4-byte groups fed so far, never less than 1.0
+    def approx_32bit_chars
+      [1.0, @position / 4.0].max
+    end
+
+    # @return [Float] number of 2-byte groups fed so far, never less than 1.0
+    def approx_16bit_chars
+      [1.0, @position / 2.0].max
+    end
+
+    # UTF-32BE: only the last byte of each quad is expected to be non-zero.
+    def is_likely_utf32be
+      approx_chars = approx_32bit_chars
+      approx_chars >= MIN_CHARS_FOR_DETECTION &&
+        @zeros_at_mod[0] / approx_chars > EXPECTED_RATIO &&
+        @zeros_at_mod[1] / approx_chars > EXPECTED_RATIO &&
+        @zeros_at_mod[2] / approx_chars > EXPECTED_RATIO &&
+        @nonzeros_at_mod[3] / approx_chars > EXPECTED_RATIO &&
+        !@invalid_utf32be
+    end
+
+    # UTF-32LE: only the first byte of each quad is expected to be non-zero.
+    def is_likely_utf32le
+      approx_chars = approx_32bit_chars
+      approx_chars >= MIN_CHARS_FOR_DETECTION &&
+        @nonzeros_at_mod[0] / approx_chars > EXPECTED_RATIO &&
+        @zeros_at_mod[1] / approx_chars > EXPECTED_RATIO &&
+        @zeros_at_mod[2] / approx_chars > EXPECTED_RATIO &&
+        @zeros_at_mod[3] / approx_chars > EXPECTED_RATIO &&
+        !@invalid_utf32le
+    end
+
+    # UTF-16BE: even offsets are expected to be zero, odd offsets non-zero.
+    def is_likely_utf16be
+      approx_chars = approx_16bit_chars
+      approx_chars >= MIN_CHARS_FOR_DETECTION &&
+        (@nonzeros_at_mod[1] + @nonzeros_at_mod[3]) / approx_chars > EXPECTED_RATIO &&
+        (@zeros_at_mod[0] + @zeros_at_mod[2]) / approx_chars > EXPECTED_RATIO &&
+        !@invalid_utf16be
+    end
+
+    # UTF-16LE: odd offsets are expected to be zero, even offsets non-zero.
+    def is_likely_utf16le
+      approx_chars = approx_16bit_chars
+      approx_chars >= MIN_CHARS_FOR_DETECTION &&
+        (@nonzeros_at_mod[0] + @nonzeros_at_mod[2]) / approx_chars > EXPECTED_RATIO &&
+        (@zeros_at_mod[1] + @zeros_at_mod[3]) / approx_chars > EXPECTED_RATIO &&
+        !@invalid_utf16le
+    end
+
+    # Validate if the quad of bytes is valid UTF-32.
+    #
+    # UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
+    # excluding 0x0000D800 - 0x0000DFFF
+    #
+    # https://en.wikipedia.org/wiki/UTF-32
+    #
+    # @param [Array<Integer>] quad four consecutive bytes
+    # @return [void]
+    def validate_utf32_characters(quad)
+      # `||`/`&&` are required here: Ruby's `or`/`and` share one precedence
+      # level, so the surrogate test would otherwise gate the whole condition.
+      if quad[0] != 0 || quad[1] > 0x10 ||
+         (quad[0] == 0 && quad[1] == 0 && (0xD8..0xDF).include?(quad[2]))
+        @invalid_utf32be = true
+      end
+      if quad[3] != 0 || quad[2] > 0x10 ||
+         (quad[3] == 0 && quad[2] == 0 && (0xD8..0xDF).include?(quad[1]))
+        @invalid_utf32le = true
+      end
+    end
+
+    # Validate if the pair of bytes is valid UTF-16.
+    #
+    # UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xDFFF
+    # with an exception for surrogate pairs, which must be in the range
+    # 0xD800-0xDBFF followed by 0xDC00-0xDFFF
+    #
+    # https://en.wikipedia.org/wiki/UTF-16
+    #
+    # @param [Array<Integer>] pair two consecutive bytes
+    # @return [void]
+    def validate_utf16_characters(pair)
+      # Big-endian: the high byte of the code unit is pair[0].
+      if @first_half_surrogate_pair_detected_16be
+        if (0xDC..0xDF).include?(pair[0])
+          @first_half_surrogate_pair_detected_16be = false
+        else
+          @invalid_utf16be = true
+        end
+      elsif (0xD8..0xDB).include?(pair[0])
+        @first_half_surrogate_pair_detected_16be = true
+      elsif (0xDC..0xDF).include?(pair[0])
+        @invalid_utf16be = true
+      end
+
+      # Little-endian: the high byte of the code unit is pair[1].
+      if @first_half_surrogate_pair_detected_16le
+        if (0xDC..0xDF).include?(pair[1])
+          @first_half_surrogate_pair_detected_16le = false
+        else
+          @invalid_utf16le = true
+        end
+      elsif (0xD8..0xDB).include?(pair[1])
+        @first_half_surrogate_pair_detected_16le = true
+      elsif (0xDC..0xDF).include?(pair[1])
+        @invalid_utf16le = true
+      end
+    end
+  end
+end
diff --git a/test/simple_assets/UTF-16BE_without_BOM.txt b/test/simple_assets/UTF-16BE_without_BOM.txt
diff --git a/test/simple_assets/UTF-16LE_without_BOM.txt b/test/simple_assets/UTF-16LE_without_BOM.txt
diff --git a/test/simple_test.rb b/test/simple_test.rb
@@ -63,12 +63,24 @@ def pending
     }
   end
 
+  it "detects UTF_16BE without BOM" do
+    assert_chardet_spec_detect 'UTF-16BE_without_BOM', {
+      "encoding" => 'UTF-16BE', "confidence" => 0.85
+    }
+  end
+
   it "detects UTF_16LE" do
     assert_chardet_spec_detect 'UTF-16LE', {
       "encoding" => 'UTF-16LE', "confidence" => 1
     }
   end
 
+  it "detects UTF_16LE without BOM" do
+    assert_chardet_spec_detect 'UTF-16LE_without_BOM', {
+      "encoding" => 'UTF-16LE', "confidence" => 0.85
+    }
+  end
+
   it "detects ISO_2022_JP" do
     assert_chardet_spec_detect 'ISO-2022-JP', {
       "encoding" => 'ISO-2022-JP', "confidence" => 0.99