
Commit 04f9159

Fixed #512 - Remove the need to have the same format for input/output for transform

* Code has been enhanced and re-organized
* Add more tests

Signed-off-by: Chin Yeung Li <[email protected]>

1 parent 9c1ff71 commit 04f9159

File tree

10 files changed (+100, -101 lines)


CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ Changelog
 
 * Fixed version mismatch (https://github.com/nexB/aboutcode-toolkit/issues/510)
 * Improve `check` performance (https://github.com/nexB/aboutcode-toolkit/issues/511)
+* Relax the requirement to have the same format for input and output for `transform`
 
 
 2022-03-21

docs/source/reference.rst

Lines changed: 1 addition & 2 deletions
@@ -595,8 +595,7 @@ Purpose
 -------
 
 Transform the CSV/JSON/XLSX file at LOCATION by applying renamings,
-filters and checks and then write a new CSV/JSON/Excel to OUTPUT
-(Format for input and output need to be the same).
+filters and checks and then write a new CSV/JSON/Excel to OUTPUT.
 
 Details
 ^^^^^^^

src/attributecode/cmd.py

Lines changed: 33 additions & 14 deletions
@@ -41,9 +41,13 @@
 from attributecode.model import get_copy_list
 from attributecode.model import pre_process_and_fetch_license_dict
 from attributecode.model import write_output
-from attributecode.transform import transform_csv_to_csv
-from attributecode.transform import transform_json_to_json
-from attributecode.transform import transform_excel_to_excel
+from attributecode.transform import transform_data
+from attributecode.transform import transform_csv
+from attributecode.transform import transform_json
+from attributecode.transform import transform_excel
+from attributecode.transform import write_csv
+from attributecode.transform import write_json
+from attributecode.transform import write_excel
 from attributecode.transform import Transformer
 from attributecode.util import extract_zip
 from attributecode.util import filter_errors
@@ -771,8 +775,7 @@ def print_config_help(ctx, param, value):
 def transform(location, output, configuration, quiet, verbose):  # NOQA
     """
     Transform the CSV/JSON/XLSX file at LOCATION by applying renamings, filters and checks
-    and then write a new CSV/JSON/XLSX to OUTPUT (Format for input and output need to be
-    the same).
+    and then write a new CSV/JSON/XLSX to OUTPUT.
 
     LOCATION: Path to a CSV/JSON/XLSX file.
 
@@ -783,16 +786,32 @@ def transform(location, output, configuration, quiet, verbose):  # NOQA
     else:
         transformer = Transformer.from_file(configuration)
 
-    if location.endswith('.csv') and output.endswith('.csv'):
-        errors = transform_csv_to_csv(location, output, transformer)
-    elif location.endswith('.json') and output.endswith('.json'):
-        errors = transform_json_to_json(location, output, transformer)
-    elif location.endswith('.xlsx') and output.endswith('.xlsx'):
-        errors = transform_excel_to_excel(location, output, transformer)
-    else:
-        msg = 'Extension for the input and output need to be the same.'
+    if not transformer:
+        msg = 'Cannot transform without Transformer'
         click.echo(msg)
-        sys.exit()
+        sys.exit(1)
+
+    errors = []
+    updated_data = []
+    new_data = []
+
+    if location.endswith('.csv'):
+        new_data, errors = transform_csv(location)
+    elif location.endswith('.json'):
+        new_data, errors = transform_json(location)
+    elif location.endswith('.xlsx'):
+        new_data, errors = transform_excel(location)
+
+    if not errors:
+        updated_data, errors = transform_data(new_data, transformer)
+
+    if not errors:
+        if output.endswith('.csv'):
+            write_csv(output, updated_data)
+        elif output.endswith('.json'):
+            write_json(output, updated_data)
+        else:
+            write_excel(output, updated_data)
 
     if not quiet:
         print_version()
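
Taken together, the rewritten command body decouples reading from writing: the input extension picks one of transform_csv, transform_json or transform_excel, the shared transform_data applies the configured renamings, filters and checks, and the output extension picks the writer, so the two formats no longer need to match. A minimal sketch of that flow outside of click, using hypothetical file names and a hypothetical configuration path:

# Sketch only: paths are hypothetical and error handling is trimmed to the happy path.
from attributecode.transform import Transformer
from attributecode.transform import transform_csv, transform_data, write_json

transformer = Transformer.from_file('transform_conf.yml')  # hypothetical config file

# Read the CSV rows into a list of dicts, then apply renamings/filters/checks.
new_data, errors = transform_csv('inventory.csv')
if not errors:
    updated_data, errors = transform_data(new_data, transformer)

# Write the transformed rows as JSON: the output format is independent of the input format.
if not errors:
    write_json('inventory.json', updated_data)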

src/attributecode/transform.py

Lines changed: 23 additions & 64 deletions
@@ -13,13 +13,11 @@
 # limitations under the License.
 # ============================================================================
 
-import io
 import json
 from collections import Counter, OrderedDict
 from itertools import zip_longest
 
 import attr
-import itertools
 import openpyxl
 
 from attributecode import CRITICAL
@@ -28,19 +26,13 @@
 from attributecode.util import csv
 from attributecode.util import replace_tab_with_spaces
 
-
-def transform_csv_to_csv(location, output, transformer):
+def transform_csv(location):
     """
-    Read a CSV file at `location` and write a new CSV file at `output`. Apply
-    transformations using the `transformer` Transformer.
-    Return a list of Error objects.
+    Read a CSV file at `location` and convert the data into a list of dictionaries.
     """
-    if not transformer:
-        raise ValueError('Cannot transform without Transformer')
-
-    rows = read_csv_rows(location)
-
     errors = []
+    new_data = []
+    rows = read_csv_rows(location)
     data = iter(rows)
     names = next(rows)
     field_names = strip_trailing_fields_csv(names)
@@ -50,65 +42,39 @@ def transform_csv_to_csv(location, output, transformer):
         msg = u'Duplicated field name: %(name)s'
         for name in dupes:
             errors.append(Error(CRITICAL, msg % locals()))
-        return errors
 
-    # Convert to dicts
-    new_data = [dict(zip_longest(field_names, item)) for item in data]
+    if not errors:
+        # Convert to dicts
+        new_data = [dict(zip_longest(field_names, item)) for item in data]
 
-    field_names, updated_data, errors = transform_data(new_data, transformer)
-
-    if errors:
-        return errors
-    else:
-        write_csv(output, updated_data, field_names)
-        return []
+    return new_data, errors
 
 
-def transform_json_to_json(location, output, transformer):
+def transform_json(location):
     """
-    Read a JSON file at `location` and write a new JSON file at `output`. Apply
-    transformations using the `transformer` Transformer.
-    Return a list of Error objects.
+    Read a JSON file at `location` and convert the data into a list of dictionaries.
     """
-    if not transformer:
-        raise ValueError('Cannot transform without Transformer')
-
+    errors = []
+    new_data = []
     items = read_json(location)
     data = normalize_dict_data(items)
     new_data = strip_trailing_fields_json(data)
 
-    _field_names, updated_data, errors = transform_data(new_data, transformer)
-
-    if errors:
-        return errors
-    else:
-        write_json(output, updated_data)
-        return []
+    return new_data, errors
 
 
-def transform_excel_to_excel(location, output, transformer):
+def transform_excel(location):
     """
-    Read a XLSX file at `location` and write a new Excel file at `output`. Apply
-    transformations using the `transformer` Transformer.
-    Return a list of Error objects.
+    Read an XLSX file at `location` and convert the data into a list of dictionaries.
     """
-    if not transformer:
-        raise ValueError('Cannot transform without Transformer')
-
-    dupes, new_data = read_excel(location)
     errors = []
+    new_data = []
+    dupes, new_data = read_excel(location)
     if dupes:
         msg = u'Duplicated field name: %(name)s'
         for name in dupes:
             errors.append(Error(CRITICAL, msg % locals()))
-        return errors
-
-    _field_names, updated_data, errors = transform_data(new_data, transformer)
-    if errors:
-        return errors
-    else:
-        write_excel(output, updated_data)
-        return []
+    return new_data, errors
 
 
 def strip_trailing_fields_csv(names):
@@ -160,25 +126,18 @@ def transform_data(data, transformer):
     Return a tuple of:
     ([field names...], [transformed ordered dict...], [Error objects..])
     """
-    if not transformer:
-        return data
-
     renamed_field_data = transformer.apply_renamings(data)
 
-    field_names = renamed_field_data[0].keys()
-
     if transformer.field_filters:
         renamed_field_data = list(transformer.filter_fields(renamed_field_data))
-        field_names = [c for c in field_names if c in transformer.field_filters]
 
     if transformer.exclude_fields:
         renamed_field_data = list(transformer.filter_excluded(renamed_field_data))
-        field_names = [c for c in field_names if c not in transformer.exclude_fields]
 
     errors = transformer.check_required_fields(renamed_field_data)
     if errors:
-        return field_names, data, errors
-    return field_names, renamed_field_data, errors
+        return data, errors
+    return renamed_field_data, errors
 
 
 tranformer_config_help = '''
@@ -395,11 +354,11 @@ def read_json(location):
         return json.load(jsonfile)
 
 
-def write_csv(location, data, field_names):  # NOQA
+def write_csv(location, data):
     """
-    Write a CSV file at `location` the `data` list of ordered dicts using the
-    `field_names`.
+    Write a CSV file at `location` with the `data`, which is a list of ordered dicts.
     """
+    field_names = list(data[0].keys())
     with open(location, 'w', encoding='utf-8', newline='\n', errors='replace') as csvfile:
         writer = csv.DictWriter(csvfile, fieldnames=field_names)
         writer.writeheader()
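
With the readers now only loading data and transform_data returning a plain (data, errors) pair, the writers take over column handling: write_csv derives its header from the keys of the first row, so every row is expected to share the same keys. A small sketch of that writer contract, with hypothetical output paths and made-up sample rows:

# Sketch only: hypothetical output paths and sample rows.
from collections import OrderedDict

from attributecode.transform import write_csv, write_json

rows = [
    OrderedDict([('about_resource', '/tmp/test.c'), ('name', 'test.c'), ('version', '1')]),
    OrderedDict([('about_resource', '/tmp/tmp.h'), ('name', 'tmp.h'), ('version', None)]),
]

# The CSV header comes from rows[0].keys(); the same in-memory rows can be
# written to either format because the writer no longer needs a field_names argument.
write_csv('/tmp/rows.csv', rows)
write_json('/tmp/rows.json', rows)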

tests/test_cmd.py

Lines changed: 0 additions & 6 deletions
@@ -136,8 +136,6 @@ def test_report_errors_with_verbose_flag(capsys):
         'DEBUG: msg4',
         'NOTSET: msg4'
     ]
-    print("@@@@@@@@@@@@@@@@@@@@@@@@")
-    print(out.splitlines(False))
     assert expected_out == out.splitlines(False)
     assert '' == err
 
@@ -334,10 +332,6 @@ def check_about_stdout(options, expected_loc, regen=False):
     with open(expected_file, 'r') as ef:
         expected = ef.read()
 
-    print("!!!!!!!!!!!!!!!!!!!!")
-    print(expected.splitlines(False))
-    print("#####################")
-    print(result.output.splitlines(False))
     assert expected.splitlines(False) == result.output.splitlines(False)
 
 
tests/test_transform.py

Lines changed: 31 additions & 13 deletions
@@ -26,6 +26,7 @@
 from attributecode.transform import strip_trailing_fields_json
 from attributecode.transform import Transformer
 from attributecode.transform import read_csv_rows, read_excel, read_json
+from attributecode.transform import transform_csv, transform_excel, transform_json
 
 
 class TransformTest(unittest.TestCase):
@@ -36,16 +37,12 @@ def test_transform_data_new_col(self):
         configuration = get_test_loc('test_transform/configuration_new_cols')
         transformer = Transformer.from_file(configuration)
 
-        field_name, data, err = transform_data(data, transformer)
+        data, err = transform_data(data, transformer)
 
-        expect_name = [u'path', u'about_resource', u'name', u'version', u'notes', u'temp']
         expected_data = [dict(OrderedDict([(u'path', u'/tmp/test.c'),
                                            (u'about_resource', u'/tmp/test.c'),
                                            (u'name', u'test.c'), (u'version', u'1'),
                                            (u'notes', u'test'), (u'temp', u'foo')]))]
-        assert len(field_name) == len(expect_name)
-        for name in field_name:
-            assert name in expect_name
         assert len(data) == len(expected_data)
         for d in data:
             assert dict(d) in expected_data
@@ -57,14 +54,11 @@ def test_transform_data(self):
         configuration = get_test_loc('test_transform/configuration')
         transformer = Transformer.from_file(configuration)
 
-        field_name, data, err = transform_data(data, transformer)
+        data, err = transform_data(data, transformer)
 
         expect_name = [u'about_resource', u'name', u'version']
         expected_data = [dict(OrderedDict([(u'about_resource', u'/tmp/test.c'), (u'name', u'test.c'), (u'version', u'1')]))]
 
-        assert len(field_name) == len(expect_name)
-        for name in field_name:
-            assert name in expect_name
         assert len(data) == len(expected_data)
         for d in data:
             assert dict(d) in expected_data
@@ -75,15 +69,12 @@ def test_transform_data_mutli_rows(self):
         configuration = get_test_loc('test_transform/configuration2')
         transformer = Transformer.from_file(configuration)
 
-        field_name, data, err = transform_data(data, transformer)
+        data, err = transform_data(data, transformer)
 
         expect_name = [u'about_resource', u'name', u'version']
         expected_data = [dict(OrderedDict([(u'about_resource', u'/tmp/test.c'), (u'name', u'test.c'), (u'version', u'v0.01')])),
                          dict(OrderedDict([(u'about_resource', u'/tmp/tmp.h'), (u'name', u'tmp.h'), (u'version', None)]))]
 
-        assert len(field_name) == len(expect_name)
-        for name in field_name:
-            assert name in expect_name
         assert len(data) == len(expected_data)
         for d in data:
             assert dict(d) in expected_data
@@ -173,3 +164,30 @@ def test_read_csv_rows(self):
                     ['/test.c', 'test.c', 'mit'],
                     ['/test2.c', 'test2.c', 'mit and apache-2.0']]
         assert list(data) == expected
+
+    def test_transform_csv(self):
+        test_file = get_test_loc('test_transform/input.csv')
+        data, err = transform_csv(test_file)
+        expected = [{'Directory/Filename': '/aboutcode-toolkit/',
+                     'Component': 'AboutCode-toolkit',
+                     'Confirmed Version': '123', 'notes': ''}]
+        assert len(err) == 0
+        assert data == expected
+
+    def test_transform_excel(self):
+        test_file = get_test_loc('test_transform/input.xlsx')
+        data, err = transform_excel(test_file)
+        expected = [OrderedDict([('Directory/Filename', '/aboutcode-toolkit/'),
+                                 ('Component', 'AboutCode-toolkit'),
+                                 ('Confirmed Version', 123), ('notes', '')])]
+        assert len(err) == 0
+        assert data == expected
+
+    def test_transform_json(self):
+        test_file = get_test_loc('test_transform/input.json')
+        data, err = transform_json(test_file)
+        expected = [{'Directory/Filename': '/aboutcode-toolkit/',
+                     'Component': 'AboutCode-toolkit',
+                     'Confirmed Version': '123', 'notes': ''}]
+        assert len(err) == 0
+        assert data == expected

tests/testdata/test_cmd/help/about_transform_help.txt

Lines changed: 1 addition & 2 deletions
@@ -1,8 +1,7 @@
 Usage: about transform [OPTIONS] LOCATION OUTPUT
 
   Transform the CSV/JSON/XLSX file at LOCATION by applying renamings, filters
-  and checks and then write a new CSV/JSON/XLSX to OUTPUT (Format for input and
-  output need to be the same).
+  and checks and then write a new CSV/JSON/XLSX to OUTPUT.
 
   LOCATION: Path to a CSV/JSON/XLSX file.
 
tests/testdata/test_transform/input.csv

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+Directory/Filename,Component,Confirmed Version,notes
+/aboutcode-toolkit/,AboutCode-toolkit,123,
tests/testdata/test_transform/input.json

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+[
+  {
+    "Directory/Filename": "/aboutcode-toolkit/",
+    "Component": "AboutCode-toolkit",
+    "Confirmed Version": "123",
+    "notes": ""
+  }
+]
tests/testdata/test_transform/input.xlsx

9.76 KB
Binary file not shown.
