Merge branch '781-jsonlines-plugin' into develop

steven-esser · steven-esser · commit 80fad494b23f · 2017-10-05T15:25:29.000-07:00
diff --git a/setup.py b/setup.py
@@ -219,6 +219,7 @@ def read(*names, **kwargs):
             'spdx-tv = formattedcode.format_spdx:write_spdx_tag_value',
             'spdx-rdf = formattedcode.format_spdx:write_spdx_rdf',
             'csv = formattedcode.format_csv:write_csv',
+            'jsonlines = formattedcode.format_jsonlines:write_jsonlines',
         ],
 
         # scancode_post_scan is an entry point for post_scan_plugins.
diff --git a/src/formattedcode/format_json.py b/src/formattedcode/format_json.py
@@ -66,5 +66,6 @@ def _write_json(files_count, version, notice, scanned_files, options, output_fil
     else:
         kwargs['separators'] = (',', ':',)
 
+    # FIXME: Why do we wrap the output in unicode? Test output when we do not wrap the output in unicode
     output_file.write(unicode(simplejson.dumps(scan, **kwargs)))
     output_file.write('\n')
diff --git a/src/formattedcode/format_jsonlines.py b/src/formattedcode/format_jsonlines.py
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+#  Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+#  ScanCode should be considered or used as legal advice. Consult an Attorney
+#  for any legal advice.
+#  ScanCode is a free software code scanning tool from nexB Inc. and others.
+#  Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+
+import simplejson
+
+from plugincode.output import scan_output_writer
+
+
+"""
+Output plugins to write scan results as JSON Lines.
+"""
+
+
+@scan_output_writer
+def write_jsonlines(files_count, version, notice, scanned_files, options, output_file, *args, **kwargs):
+    """
+    Write the scan to `output_file` as JSON Lines: one `header` line with the
+    notice, version, options and files count, then one `files` line per file.
+    """
+    header_fields = OrderedDict([
+        ('scancode_notice', notice),
+        ('scancode_version', version),
+        ('scancode_options', options),
+        ('files_count', files_count)
+    ])
+    dump_args = dict(iterable_as_array=True, encoding='utf-8', separators=(',', ':',))
+
+    def write_line(record):
+        output_file.write(simplejson.dumps(record, **dump_args))
+        output_file.write('\n')
+
+    write_line(dict(header=header_fields))
+    for scanned_file in scanned_files:
+        write_line({'files': [scanned_file]})
diff --git a/src/scancode/cli_test_utils.py b/src/scancode/cli_test_utils.py
@@ -69,8 +69,7 @@ def check_json_scan(expected_file, result_file, regen=False, strip_dates=False):
 
 def _load_json_result(result_file):
     """
-    Load the result file as utf-8 JSON and strip test_dir prefix from
-    locations.
+    Load the result file as utf-8 JSON.
     Sort the results by location.
     """
     with codecs.open(result_file, encoding='utf-8') as res:
diff --git a/tests/formattedcode/data/json/simple-expected.jsonlines b/tests/formattedcode/data/json/simple-expected.jsonlines
@@ -0,0 +1,40 @@
+[
+    {
+        "header": {
+            "scancode_notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.",
+            "scancode_version": "2.1.0.post69.536f354.dirty.20171004191716",
+            "scancode_options": {
+                "--info": true,
+                "--license-score": 0,
+                "--format": "jsonlines"
+            },
+            "files_count": 1
+        }
+    },
+    {
+        "files": [
+            {
+                "path": "simple/copyright_acme_c-c.c",
+                "type": "file",
+                "name": "copyright_acme_c-c.c",
+                "base_name": "copyright_acme_c-c",
+                "extension": ".c",
+                "date": "2017-10-03",
+                "size": 55,
+                "sha1": "e2466d5b764d27fb301ceb439ffb5da22e43ab1d",
+                "md5": "bdf7c572beb4094c2059508fa73c05a4",
+                "files_count": null,
+                "mime_type": "text/plain",
+                "file_type": "UTF-8 Unicode text, with no line terminators",
+                "programming_language": "C",
+                "is_binary": false,
+                "is_text": true,
+                "is_archive": false,
+                "is_media": false,
+                "is_source": true,
+                "is_script": false,
+                "scan_errors": []
+            }
+        ]
+    }
+]
diff --git a/tests/formattedcode/test_format_jsonlines.py b/tests/formattedcode/test_format_jsonlines.py
@@ -0,0 +1,102 @@
+#
+# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+#  Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+#  ScanCode should be considered or used as legal advice. Consult an Attorney
+#  for any legal advice.
+#  ScanCode is a free software code scanning tool from nexB Inc. and others.
+#  Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+from __future__ import unicode_literals
+
+import codecs
+from collections import OrderedDict
+import json
+import os
+
+from commoncode.testcase import FileDrivenTesting
+from scancode.cli_test_utils import run_scan_click
+
+
+test_env = FileDrivenTesting()
+test_env.test_data_dir = os.path.join(os.path.dirname(__file__), 'data')
+
+
+def remove_variable_data(scan_result):
+    """
+    Remove in place from the `scan_result` list of mappings the fields that
+    vary between runs: the header `scancode_version` and each file `date`.
+    """
+    for line in scan_result:
+        header = line.get('header')
+        if header:
+            # no KeyError if already removed, as in a regenerated expected file
+            header.pop('scancode_version', None)
+        for scanned_file in line.get('files', []):
+            scanned_file.pop('date', None)
+
+
+def check_jsonlines_scan(expected_file, result_file, regen=False):
+    """
+    Assert that the JSON Lines scan in result_file matches expected_file once
+    the variable fields are removed from both. result_file is read as one JSON
+    value per line; expected_file is read as a single JSON document holding the
+    list of these values. If regen is True, expected_file is first OVERWRITTEN
+    with the current results: handy to update expectations, use with caution.
+    """
+    actual = _load_jsonlines_result(result_file)
+    remove_variable_data(actual)
+
+    if regen:
+        with open(expected_file, 'wb') as regenerated:
+            json.dump(actual, regenerated)
+
+    wanted = _load_json_result(expected_file)
+    remove_variable_data(wanted)
+    assert wanted == actual
+
+
+def _load_jsonlines_result(result_file):
+    """Return the list of JSON values parsed from each line of the utf-8 `result_file`."""
+    with codecs.open(result_file, encoding='utf-8') as jsonlines:
+        # mappings are loaded as OrderedDict so that the keys keep their file order
+        records = [json.loads(text, object_pairs_hook=OrderedDict) for text in jsonlines]
+    return records
+
+
+def _load_json_result(result_file):
+    """Return the single JSON document parsed from the utf-8 `result_file`."""
+    with codecs.open(result_file, encoding='utf-8') as json_doc:
+        # mappings are loaded as OrderedDict so that the keys keep their file order
+        loaded = json.load(json_doc, object_pairs_hook=OrderedDict)
+    return loaded
+
+
+def test_jsonlines():
+    """Scan json/simple with -i as jsonlines and compare with the expected file."""
+    scanned_dir = test_env.get_test_loc('json/simple')
+    output_loc = test_env.get_temp_file('jsonline')
+    scan = run_scan_click(['-i', '--format', 'jsonlines', scanned_dir, output_loc])
+    assert scan.exit_code == 0
+    assert 'Scanning done' in scan.output
+    # NOTE(review): the expected location goes through get_test_loc twice; confirm the second call is needed
+    expected_loc = test_env.get_test_loc('json/simple-expected.jsonlines')
+    check_jsonlines_scan(test_env.get_test_loc(expected_loc), output_loc, regen=False)