diff --git a/lib/rchardet/jpcntx.rb b/lib/rchardet/jpcntx.rb index b53f3f3..37bee2c 100644 --- a/lib/rchardet/jpcntx.rb +++ b/lib/rchardet/jpcntx.rb @@ -170,7 +170,7 @@ def got_enough_data def get_confidence # This is just one way to calculate confidence. It works well for me. if @totalRel > MINIMUM_DATA_THRESHOLD - return (@totalRel - @relSample[0]) / @totalRel + return (@totalRel - @relSample[0]).to_f / @totalRel else return DONT_KNOW end diff --git a/lib/rchardet/latin1prober.rb b/lib/rchardet/latin1prober.rb index 56d8e45..78b2307 100644 --- a/lib/rchardet/latin1prober.rb +++ b/lib/rchardet/latin1prober.rb @@ -133,7 +133,7 @@ def get_confidence if total < 0.01 confidence = 0.0 else - confidence = (@freqCounter[3] / total) - (@freqCounter[1] * 20.0 / total) + confidence = (@freqCounter[3].to_f / total) - (@freqCounter[1] * 20.0 / total) end if confidence < 0.0 confidence = 0.0 diff --git a/lib/rchardet/sbcharsetprober.rb b/lib/rchardet/sbcharsetprober.rb index 973fcc8..fec9341 100644 --- a/lib/rchardet/sbcharsetprober.rb +++ b/lib/rchardet/sbcharsetprober.rb @@ -110,7 +110,7 @@ def feed(aBuf) def get_confidence r = 0.01 if @totalSeqs > 0 - r = (1.0 * @seqCounters[POSITIVE_CAT]) / @totalSeqs / @model['mTypicalPositiveRatio'] + r = @seqCounters[POSITIVE_CAT].to_f / @totalSeqs / @model['mTypicalPositiveRatio'] r = r * @freqChar / @totalChar if r >= 1.0 r = 0.99 diff --git a/test/complete_test.rb b/test/complete_test.rb index 52e4ece..915b8e3 100644 --- a/test/complete_test.rb +++ b/test/complete_test.rb @@ -31,6 +31,8 @@ assert u.result, "result should not be nil" assert u.result['encoding'], "encoding should not be nil, result: #{u.result.inspect}" + assert u.result['confidence'], "confidence should not be nil, result: #{u.result.inspect}" + assert_equal u.result['confidence'].class, Float assert_equal encoding, u.result['encoding'].downcase end end diff --git a/test/simple_assets/windows-1252.txt b/test/simple_assets/windows-1252.txt new file mode 100644 index 0000000..4ee6102 --- /dev/null +++ b/test/simple_assets/windows-1252.txt @@ -0,0 +1 @@ +1234567890,ASDF,JKL,123 WHEREVER AVE,SOMEWHERE TOWN,UBERLāNDIA,USER@EXAMPLE.COM diff --git a/test/simple_test.rb b/test/simple_test.rb index 68e1089..5b814a2 100644 --- a/test/simple_test.rb +++ b/test/simple_test.rb @@ -4,7 +4,9 @@ describe "Simple" do def assert_chardet_spec_detect(file, expected) content = File.open("test/simple_assets/#{file}.txt", 'rb'){|io| io.read } - assert_equal expected, CharDet.detect(content) + detected = CharDet.detect(content) + assert_equal expected, detected + assert_equal detected['confidence'].class, Float end def pending @@ -23,7 +25,7 @@ def pending it "detects Shift_JIS" do assert_chardet_spec_detect 'Shift_JIS', { - "encoding" => 'SHIFT_JIS', "confidence" => (RUBY_VERSION > "1.9.3" ? 0.99 : 1) # TODO the 1.9 value might be wrong but I cannot find any bug + "encoding" => 'SHIFT_JIS', "confidence" => (RUBY_VERSION > "1.9.3" ? 0.99 : 1.0) # TODO the 1.9 value might be wrong but I cannot find any bug } end @@ -41,13 +43,13 @@ def pending it "detects UTF_16BE" do assert_chardet_spec_detect 'UTF-16BE' , { - "encoding" => 'UTF-16BE', "confidence" => 1 + "encoding" => 'UTF-16BE', "confidence" => 1.0 } end it "detects UTF_16LE" do assert_chardet_spec_detect 'UTF-16LE' , { - "encoding" => 'UTF-16LE', "confidence" => 1 + "encoding" => 'UTF-16LE', "confidence" => 1.0 } end @@ -63,6 +65,13 @@ def pending } end + it "detects windows-1252" do + assert_chardet_spec_detect 'windows-1252' , { + # not perfect, but better than detecting nil + "encoding" => 'windows-1252', "confidence" => 0.36875 + } + end + it "detects russian" do # this failed when using $KCODE='u' on 1.8 ... just making sure it stays put CharDet.detect("Toto je zpr\xE1va ve form\xE1tu MIME s n\xECkolika \xE8\xE1stmi.\n")["encoding"].must_equal "windows-1251"