Skip to content

Commit 561d5cf

Browse files
committed
Fixed #432
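* The issue described in #432 is caused by Microsoft's UTF-8 variant, which prefixes the file with a byte order mark (BOM) signature. Fix it by changing the encoding used to read CSV files from 'utf-8' to 'utf-8-sig', which strips the BOM when present. See https://docs.python.org/3/library/codecs.html#module-encodings.utf_8_sig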
1 parent 2f48a6a commit 561d5cf

File tree

6 files changed

+19
-4
lines changed

6 files changed

+19
-4
lines changed

docs/CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* Enhance the `transform` to also work with JSON file
55
* Update transform code (See #427 and #428)
66
* Fixed #431 - Error handling for empty "_file" fields
7+
* Fixed #432 - Handled UTF-8 variant invented by Microsoft
78

89
2020-05-05
910
Release 4.0.2

src/attributecode/gen.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# -*- coding: utf8 -*-
33

44
# ============================================================================
5-
# Copyright (c) 2013-2019 nexB Inc. http://www.nexb.com/ - All rights reserved.
5+
# Copyright (c) 2013-2020 nexB Inc. http://www.nexb.com/ - All rights reserved.
66
# Licensed under the Apache License, Version 2.0 (the "License");
77
# you may not use this file except in compliance with the License.
88
# You may obtain a copy of the License at
@@ -48,8 +48,7 @@ def check_duplicated_columns(location):
4848
at location.
4949
"""
5050
location = add_unc(location)
51-
# FIXME: why errors=ignore?
52-
with codecs.open(location, 'rb', encoding='utf-8', errors='ignore') as csvfile:
51+
with codecs.open(location, 'rb', encoding='utf-8-sig', errors='replace') as csvfile:
5352
reader = csv.reader(csvfile)
5453
columns = next(reader)
5554
columns = [col for col in columns]

src/attributecode/util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def load_csv(location):
272272
"""
273273
results = []
274274
# FIXME: why ignore encoding errors here?
275-
with codecs.open(location, mode='rb', encoding='utf-8',
275+
with codecs.open(location, mode='rb', encoding='utf-8-sig',
276276
errors='ignore') as csvfile:
277277
for row in csv.DictReader(csvfile):
278278
# convert all the column keys to lower case

tests/test_util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,17 @@ def test_format_about_dict_for_csv_output(self):
352352
output = util.format_about_dict_for_csv_output(about)
353353
assert output == expected
354354

355+
def test_load_csv_microsoft_utf_8(self):
356+
test_file = get_test_loc('test_util/csv/test_ms_utf8.csv')
357+
expected = [OrderedDict([(u'about_resource', u'/myFile'), (u'name', u'myName')])]
358+
result = util.load_csv(test_file)
359+
assert expected == result
360+
361+
def test_load_csv_utf_8(self):
362+
test_file = get_test_loc('test_util/csv/test_utf8.csv')
363+
expected = [OrderedDict([(u'about_resource', u'/myFile'), (u'name', u'\u540d')])]
364+
result = util.load_csv(test_file)
365+
assert expected == result
355366

356367
class TestJson(unittest.TestCase):
357368

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
about_resource,name
2+
/myFile,myName
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
about_resource,name
2+
/myFile,名

0 commit comments

Comments
 (0)