1+ # -*- coding: utf-8 -*-
12#
23# Copyright (c) nexB Inc. and others. All rights reserved.
34# http://nexb.com and https://github.com/nexB/scancode-toolkit/
@@ -131,10 +132,18 @@ def test_hex_digest3(self):
131132 result = simhash .hex_digest ()
132133 assert result == '7f43e1b18f9c0e705fcf28007bc41754'
133134
def test_hex_digest4(self):
    """A Simhash that has received no updates has no hex digest.

    ``hex_digest()`` is expected to return ``None`` when nothing was hashed.
    """
    simhash = Simhash()
    # Identity check: `== None` could be satisfied by any object whose
    # __eq__ claims equality with None, hiding a wrong return value.
    assert simhash.hex_digest() is None
137138
def test_hex_digest_non_ascii(self):
    """Check the hex digest of text containing non-ASCII characters.

    Guards against mishandling of non-ASCII input. See #1690.
    """
    hasher = Simhash()
    hasher.update('Copyright (c) Mário Morgado')
    assert hasher.hex_digest() == '01010040c1300a05ce41804024000001'
146+
138147 def test_update (self ):
139148 simhash = Simhash ()
140149 assert simhash .tokens == []
@@ -152,6 +161,14 @@ def test_generate_fingerprint(self):
152161 simhash .update ('this will get added too!' )
153162 expected = bitarray ('00000010000000000011110010100000101000001111100000000001010110000110101110111000100000110101000000010100100000000010110011010010' )
154163 assert simhash .generate_fingerprint () == expected
164+
def test_generate_fingerprint_non_ascii(self):
    """Check the fingerprint bits of text containing non-ASCII characters.

    Guards against mishandling of non-ASCII input. See #1690.
    """
    hasher = Simhash()
    hasher.update('Copyright (c) Mário Morgado')
    fingerprint = hasher.generate_fingerprint()

    assert fingerprint == bitarray('00000001000000010000000001000000110000010011000000001010000001011100111001000001100000000100000000100100000000000000000000000001')
155172
156173 def test_similarity_matching1 (self ):
157174 simhash1 = Simhash ()
0 commit comments