Skip to content

Commit a89b8f0

Browse files
committed
Improve replacement of HTML entities #930
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 7e8c029 commit a89b8f0

File tree

7 files changed

+42
-15
lines changed

7 files changed

+42
-15
lines changed

etc/scripts/testdata/livescan/expected.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ Resource,type,name,base_name,extension,size,date,sha1,md5,mime_type,file_type,pr
1212
/package.json,file,package.json,package,.json,2200,2017-10-03,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1313
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
1414
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,False,[u'mit'],,,,,,,,,,,,,,,,,,,
15-
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost &lt [email protected]&gt,,,,,,,,,,,,,,,,,,
16-
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost &lt,,,,,,,,,,,,,,,,,
15+
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost <[email protected]>,,,,,,,,,,,,,,,,,,
16+
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost,,,,,,,,,,,,,,,,,
1717
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,[email protected],,,,,,,,,,,,,,,,
1818
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
1919
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,

src/cluecode/copyrights.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1795,8 +1795,8 @@ def prepare_text_line(line):
17951795
"""
17961796
Prepare a line of text for copyright detection.
17971797
"""
1798-
re_sub = re.sub
17991798
# FIXME: maintain the original character positions
1799+
re_sub = re.sub
18001800

18011801
# strip whitespace
18021802
line = line.strip()
@@ -1811,13 +1811,10 @@ def prepare_text_line(line):
18111811
# replace (" with a space
18121812
line = line.replace(r'("', ' ')
18131813

1814-
# strip comment markers
1815-
# common comment characters
1816-
line = line.strip('\\/*#%;')
18171814
# less common comment line prefix in dos
18181815
line = re_sub('^rem\s+', ' ', line)
18191816
line = re_sub('^\@rem\s+', ' ', line)
1820-
# un common comment line prefix in autotools am/in
1817+
# less common comment line prefix in autotools am/in
18211818
line = re_sub('^dnl\s+', ' ', line)
18221819
# less common comment line prefix in man pages
18231820
line = re_sub('^\.\\\\"', ' ', line)
@@ -1832,7 +1829,9 @@ def prepare_text_line(line):
18321829
line = line.replace('&copy;', ' (c) ')
18331830
line = line.replace('&#169;', ' (c) ')
18341831
line = line.replace('&#xa9;', ' (c) ')
1832+
line = line.replace('&#XA9;', ' (c) ')
18351833
line = line.replace(u'\xa9', ' (c) ')
1834+
line = line.replace(u'\XA9', ' (c) ')
18361835
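# FIXME: what is \xc2??? presumably the lead byte of a UTF-8 encoded copyright sign (0xC2 0xA9) mis-decoded as latin-1 -- confirm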
18371836
line = line.replace(u'\xc2', '')
18381837

@@ -1842,6 +1841,16 @@ def prepare_text_line(line):
18421841
line = line.replace(u'&#13;&#10;', ' ')
18431842
line = line.replace(u'&#13;', ' ')
18441843
line = line.replace(u'&#10;', ' ')
1844+
# spaces
1845+
line = line.replace(u'&ensp;', ' ')
1846+
line = line.replace(u'&emsp;', ' ')
1847+
line = line.replace(u'&thinsp;', ' ')
1848+
1849+
# common named entities
1850+
line = line.replace(u'&quot;', '"').replace(u'&#34;', '"')
1851+
line = line.replace(u'&amp;', '&').replace(u'&#38;', '&')
1852+
line = line.replace(u'&gt;', '>').replace(u'&#62;', '>')
1853+
line = line.replace(u'&lt;', '<').replace(u'&#60;', '<')
18451854

18461855
# normalize (possibly repeated) quotes to unique single quote '
18471856
# backticks ` and "
@@ -1851,8 +1860,10 @@ def prepare_text_line(line):
18511860
# quotes to space? but t'so will be wrecked
18521861
# line = line.replace(u"'", ' ')
18531862

1854-
# remove explicit \\n
1863+
# treat explicit CR, LF and tabs as space
18551864
line = line.replace("\\n", ' ')
1865+
line = line.replace("\\r", ' ')
1866+
line = line.replace("\\t", ' ')
18561867

18571868
# remove backslash
18581869
line = line.replace("\\", ' ')
@@ -1908,4 +1919,8 @@ def prepare_text_line(line):
19081919
# why?
19091920
line = lowercase_well_known_word(line)
19101921

1922+
# strip comment markers
1923+
# common comment characters
1924+
line = line.strip('\\/*#%;')
1925+
19111926
return line
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
\n\nCopyright (c) 2012 LearnBoost &lt;[email protected]&gt;\n\n
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
what:
2+
- copyrights
3+
- holders
4+
- holders_summary
5+
6+
copyrights:
7+
- Copyright (c) 2012 LearnBoost <[email protected]>
8+
holders:
9+
- LearnBoost
10+
holders_summary:
11+
- LearnBoost

tests/cluecode/data/ics/speex-include-speex/speex_types.h.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- (c) COPYRIGHT 1994-2002 by the Xiph.Org Foundation http://www.xiph.org/
6+
- (c) COPYRIGHT 1994-2002 by the Xiph.Org Foundation http://www.xiph.org
77
holders:
88
- the Xiph.Org Foundation
99
holders_summary:

tests/cluecode/data/ics/speex-libspeex/smallft.c.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- (c) COPYRIGHT 1994-2001 by the XIPHOPHORUS Company http://www.xiph.org/
6+
- (c) COPYRIGHT 1994-2001 by the XIPHOPHORUS Company http://www.xiph.org
77
holders:
88
- the XIPHOPHORUS Company
99
holders_summary:

tests/formattedcode/data/csv/livescan/expected.csv

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
Resource,type,name,base_name,extension,size,date,sha1,md5,mime_type,file_type,programming_language,is_binary,is_text,is_archive,is_media,is_source,is_script,files_count,dirs_count,size_count,scan_errors,license__key,license__score,license__short_name,license__category,license__owner,license__homepage_url,license__text_url,license__reference_url,license__spdx_license_key,license__spdx_url,start_line,end_line,matched_rule__identifier,matched_rule__license_choice,matched_rule__licenses,copyright,copyright_holder,email,url,package__type,package__name,package__version,package__primary_language,package__summary,package__description,package__size,package__release_date,package__authors,package__homepage_url,package__notes,package__download_urls,package__bug_tracking_url,package__vcs_repository,package__copyright_top_level
2-
/json2csv.rb,file,json2csv.rb,json2csv,.rb,1014,2017-10-03,92a83e5f8566bee7c83cf798c1b8912d609f56e0,380b7a5f483db7ace853b8f9dca5bfec,text/x-python,"Python script, ASCII text executable",Ruby,False,True,False,False,True,True,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2+
/json2csv.rb,file,json2csv.rb,json2csv,.rb,1014,2018-02-20,92a83e5f8566bee7c83cf798c1b8912d609f56e0,380b7a5f483db7ace853b8f9dca5bfec,text/x-python,"Python script, ASCII text executable",Ruby,False,True,False,False,True,True,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,apache-2.0,89.53,Apache 2.0,Permissive,Apache Software Foundation,http://www.apache.org/licenses/,http://www.apache.org/licenses/LICENSE-2.0,https://enterprise.dejacode.com/urn/urn:dje:license:apache-2.0,Apache-2.0,https://spdx.org/licenses/Apache-2.0,5,14,apache-2.0_7.RULE,,apache-2.0,,,,,,,,,,,,,,,,,,,
44
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,Copyright (c) 2017 nexB Inc. and others.,,,,,,,,,,,,,,,,,,
55
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,3,,,,,nexB Inc. and others.,,,,,,,,,,,,,,,,,
66
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,http://nexb.com/,,,,,,,,,,,,,,,
77
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,4,,,,,,,https://github.com/nexB/scancode-toolkit/,,,,,,,,,,,,,,,
88
/json2csv.rb,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,10,,,,,,,http://apache.org/licenses/LICENSE-2.0,,,,,,,,,,,,,,,
9-
/license,file,license,license,,679,2017-10-03,75c5490a718ddd45e40e0cc7ce0c756abc373123,b965a762efb9421cf1bf4405f336e278,text/plain,ASCII text,,False,True,False,False,False,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9+
/license,file,license,license,,679,2018-02-20,75c5490a718ddd45e40e0cc7ce0c756abc373123,b965a762efb9421cf1bf4405f336e278,text/plain,ASCII text,,False,True,False,False,False,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1010
/license,,,,,,,,,,,,,,,,,,,,,,gpl-2.0-plus,100.00,GPL 2.0 or later,Copyleft,Free Software Foundation (FSF),http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html,https://enterprise.dejacode.com/urn/urn:dje:license:gpl-2.0-plus,GPL-2.0+,https://spdx.org/licenses/GPL-2.0,1,12,gpl-2.0-plus.LICENSE,,gpl-2.0-plus,,,,,,,,,,,,,,,,,,,
11-
/package.json,file,package.json,package,.json,2200,2017-10-03,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11+
/package.json,file,package.json,package,.json,2200,2018-02-20,918376afce796ef90eeda1d6695f2289c90491ac,1f66239a9b850c5e60a9382dbe2162d2,text/plain,"ASCII text, with very long lines",JSON,False,True,False,False,True,False,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1212
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,15.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit_27.RULE,,mit,,,,,,,,,,,,,,,,,,,
1313
/package.json,,,,,,,,,,,,,,,,,,,,,,mit,100.00,MIT License,Permissive,MIT,http://opensource.org/licenses/mit-license.php,http://opensource.org/licenses/mit-license.php,https://enterprise.dejacode.com/urn/urn:dje:license:mit,MIT,https://spdx.org/licenses/MIT,24,24,mit.LICENSE,,mit,,,,,,,,,,,,,,,,,,,
14-
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost &lt [email protected]&gt,,,,,,,,,,,,,,,,,,
15-
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost &lt,,,,,,,,,,,,,,,,,
14+
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,Copyright (c) 2012 LearnBoost <[email protected]>,,,,,,,,,,,,,,,,,,
15+
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,23,26,,,,,LearnBoost,,,,,,,,,,,,,,,,,
1616
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,12,,,,,,[email protected],,,,,,,,,,,,,,,,
1717
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,16,16,,,,,,,https://github.com/visionmedia/node-cookie-signature.git,,,,,,,,,,,,,,,
1818
/package.json,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27,27,,,,,,,https://github.com/visionmedia/node-cookie-signature/issues,,,,,,,,,,,,,,,

0 commit comments

Comments
 (0)