Fixed #432

chinyeungli · chinyeungli · commit 561d5cf32aa4 · 2020-08-06T16:13:13.000+08:00
* The issue described in #432 is caused by Microsoft's UTF-8 variant, which prepends a byte order mark (BOM) to the file. Fix it by changing the encoding to 'utf-8-sig'. See https://docs.python.org/3/library/codecs.html#module-encodings.utf_8_sig
diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst
@@ -4,6 +4,7 @@
 	* Enhance the `transform` to also work with JSON file
     * Update transform code (See #427 and #428)
     * Fixed #431 - Error handling for empty "_file" fields
+    * Fixed #432 - Handle CSV files saved in Microsoft's UTF-8 variant (UTF-8 with BOM)
 
 2020-05-05
     Release 4.0.2
diff --git a/src/attributecode/gen.py b/src/attributecode/gen.py
@@ -2,7 +2,7 @@
 # -*- coding: utf8 -*-
 
 # ============================================================================
-#  Copyright (c) 2013-2019 nexB Inc. http://www.nexb.com/ - All rights reserved.
+#  Copyright (c) 2013-2020 nexB Inc. http://www.nexb.com/ - All rights reserved.
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
@@ -48,8 +48,7 @@ def check_duplicated_columns(location):
     at location.
     """
     location = add_unc(location)
-    # FIXME: why errors=ignore?
-    with codecs.open(location, 'rb', encoding='utf-8', errors='ignore') as csvfile:
+    with codecs.open(location, 'rb', encoding='utf-8-sig', errors='replace') as csvfile:
         reader = csv.reader(csvfile)
         columns = next(reader)
         columns = [col for col in columns]
diff --git a/src/attributecode/util.py b/src/attributecode/util.py
@@ -272,7 +272,7 @@ def load_csv(location):
     """
     results = []
     # FIXME: why ignore encoding errors here?
-    with codecs.open(location, mode='rb', encoding='utf-8',
+    with codecs.open(location, mode='rb', encoding='utf-8-sig',
                      errors='ignore') as csvfile:
         for row in csv.DictReader(csvfile):
             # convert all the column keys to lower case
diff --git a/tests/test_util.py b/tests/test_util.py
@@ -352,6 +352,17 @@ def test_format_about_dict_for_csv_output(self):
         output = util.format_about_dict_for_csv_output(about)
         assert output == expected
 
+    def test_load_csv_microsoft_utf_8(self):
+        test_file = get_test_loc('test_util/csv/test_ms_utf8.csv')
+        expected = [OrderedDict([(u'about_resource', u'/myFile'), (u'name', u'myName')])]
+        result = util.load_csv(test_file)
+        assert expected == result
+
+    def test_load_csv_utf_8(self):
+        test_file = get_test_loc('test_util/csv/test_utf8.csv')
+        expected = [OrderedDict([(u'about_resource', u'/myFile'), (u'name', u'\u540d')])]
+        result = util.load_csv(test_file)
+        assert expected == result
 
 class TestJson(unittest.TestCase):
 
diff --git a/tests/testdata/test_util/csv/test_ms_utf8.csv b/tests/testdata/test_util/csv/test_ms_utf8.csv
@@ -0,0 +1,2 @@
+﻿about_resource,name
+/myFile,myName
diff --git a/tests/testdata/test_util/csv/test_utf8.csv b/tests/testdata/test_util/csv/test_utf8.csv
@@ -0,0 +1,2 @@
+about_resource,name
+/myFile,名

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+about_resource,name`
	`2`	`+/myFile,myName`