Skip to content

Commit f017c80

Browse files
committed
Handle non-ascii characters properly in scancode-fingerprint #1690
Signed-off-by: Steven Esser <[email protected]>
1 parent 79af902 commit f017c80

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

plugins/scancode-fingerprint/src/plugin_fingerprint/fingerprint.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@
2525
import binascii
2626
from bitarray import bitarray
2727
from bitarray import bitdiff
28-
from licensedcode.tokenize import ngrams
2928
import hashlib
3029

30+
from commoncode.text import toascii
31+
from licensedcode.tokenize import ngrams
32+
3133
HASH_LENGTH = 128
3234
SHINGLE_LENGTH = 3
3335

@@ -103,6 +105,9 @@ def process_shingles(self, shingle, weighted_list):
103105
"""
104106
Modify the weighted list with respect to the shingle.
105107
"""
108+
# Convert non-ASCII characters in the shingle to ASCII before hashing. See #1690.
109+
shingle = toascii(shingle)
110+
106111
hash = hashlib.md5(shingle.encode()).digest()
107112
result = self.bitarray_from_bytes(hash)
108113

plugins/scancode-fingerprint/tests/test_fingerprint.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# -*- coding: utf-8 -*-
12
#
23
# Copyright (c) nexB Inc. and others. All rights reserved.
34
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
@@ -131,10 +132,18 @@ def test_hex_digest3(self):
131132
result = simhash.hex_digest()
132133
assert result == '7f43e1b18f9c0e705fcf28007bc41754'
133134

134-
def test_hex_digest3(self):
135+
def test_hex_digest4(self):
135136
simhash = Simhash()
136137
assert simhash.hex_digest() == None
137138

139+
# Ensure non-ascii characters are handled properly. See #1690.
140+
def test_hex_digest_non_ascii(self):
141+
simhash = Simhash()
142+
simhash.update('Copyright (c) Mário Morgado')
143+
144+
result = simhash.hex_digest()
145+
assert result == '01010040c1300a05ce41804024000001'
146+
138147
def test_update(self):
139148
simhash = Simhash()
140149
assert simhash.tokens == []
@@ -152,6 +161,14 @@ def test_generate_fingerprint(self):
152161
simhash.update('this will get added too!')
153162
expected = bitarray('00000010000000000011110010100000101000001111100000000001010110000110101110111000100000110101000000010100100000000010110011010010')
154163
assert simhash.generate_fingerprint() == expected
164+
165+
# Ensure non-ascii characters are handled properly. See #1690.
166+
def test_generate_fingerprint_non_ascii(self):
167+
simhash = Simhash()
168+
simhash.update('Copyright (c) Mário Morgado')
169+
170+
expected = bitarray('00000001000000010000000001000000110000010011000000001010000001011100111001000001100000000100000000100100000000000000000000000001')
171+
assert simhash.generate_fingerprint() == expected
155172

156173
def test_similarity_matching1(self):
157174
simhash1 = Simhash()

0 commit comments

Comments
 (0)