@@ -183,30 +183,36 @@ def yield_token_label_pair(self, doc, lists=False):
 
             token_counter += 1
 
+
 def get_document_hashes(dataset):
     """Get the hashes for every doc in a dataset and return as set
     """
     return set([doc["_input_hash"] for doc in dataset])
 
+
 def check_all_equal(lst):
     """Check that all items in a list are equal and return True or False
     """
     return not lst or lst.count(lst[0]) == len(lst)
 
+
 def hash_matches(doc, hash):
     """Check whether the hash of the passed doc matches the passed hash
     """
     return doc["_input_hash"] == hash
 
+
 def get_doc_by_hash(dataset, hash):
     """Return a doc from a dataset where hash matches doc["_input_hash"]
     Assumes there will only be one match!
     """
     return [doc for doc in dataset if doc["_input_hash"] == hash][0]
 
+
 def get_tokens(doc):
     return [token["text"] for token in doc["tokens"]]
 
+
 def check_inputs(annotated_data):
     """Checks whether two prodigy datasets contain the same docs (evaluated by
     doc["_input_hash"] and whether those docs contain the same tokens. This is
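Note: the helpers in this hunk all key off prodigy's `_input_hash`. A minimal sketch of how they behave, using made-up docs and hash values (real prodigy docs carry many more keys):

    docs = [
        {"_input_hash": 111, "tokens": [{"text": "Smith"}, {"text": "2001"}]},
        {"_input_hash": 222, "tokens": [{"text": "Jones"}, {"text": "1999"}]},
    ]

    get_document_hashes(docs)   # {111, 222}
    check_all_equal([1, 1, 1])  # True (an empty list also returns True)
    hash_matches(docs[0], 111)  # True
    get_doc_by_hash(docs, 222)  # the second doc; assumes exactly one match
    get_tokens(docs[0])         # ["Smith", "2001"]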
@@ -231,7 +237,9 @@ def check_inputs(annotated_data):
             diff = set(doc_hashes[i]) ^ set(doc_hashes[j])
 
             if diff:
-                msg.fail(f"Docs {diff} unequal between dataset {i} and {j}", exits=1)
+                msg.fail(
+                    f"Docs {diff} unequal between dataset {i} and {j}", exits=1
+                )
 
     # Check that the tokens between the splitting and parsing docs match
 
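Note: the `^` here is set symmetric difference, so `diff` collects the hashes present in one dataset but not the other. A quick sketch with invented hashes:

    {111, 222, 333} ^ {111, 222, 444}  # {333, 444}: docs missing from one side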
@@ -245,19 +253,27 @@ def check_inputs(annotated_data):
 
     return True
 
+
 def sort_docs_list(lst):
     """Sort a list of prodigy docs by input hash
     """
-    return sorted(lst, key=lambda k: k['_input_hash'])
+    return sorted(lst, key=lambda k: k["_input_hash"])
+
 
 def combine_token_label_pairs(pairs):
     """Combines a list of [(token, label), (token, label)] to give
     (token,label,label).
     """
     return pairs[0][0:] + tuple(pair[1] for pair in pairs[1:])
 
+
 @plac.annotations(
-    input_files=("Comma separated list of paths to jsonl files containing prodigy docs.", "positional", None, str),
+    input_files=(
+        "Comma separated list of paths to jsonl files containing prodigy docs.",
+        "positional",
+        None,
+        str,
+    ),
     output_file=("Path to output tsv file.", "positional", None, str),
     respect_lines=(
        "Respect line endings? Or parse entire document in a single string?",
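Note: `combine_token_label_pairs` keeps the whole first pair and appends the label from every subsequent pair. A sketch with invented labels:

    combine_token_label_pairs([("Smith", "B-author"), ("Smith", "O")])
    # ("Smith", "B-author", "O")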
@@ -343,16 +359,22 @@ def prodigy_to_tsv(
     # NOTE: Use of reduce to handle pairs_list of unknown length
 
     if len(pairs_list) > 1:
-        merged_pairs = (combine_token_label_pairs(pairs) for pairs in reduce(zip, pairs_list))
-        example_pairs = [combine_token_label_pairs(pairs) for i, pairs in enumerate(reduce(zip, pairs_list)) if i < 15]
+        merged_pairs = (
+            combine_token_label_pairs(pairs) for pairs in reduce(zip, pairs_list)
+        )
+        example_pairs = [
+            combine_token_label_pairs(pairs)
+            for i, pairs in enumerate(reduce(zip, pairs_list))
+            if i < 15
+        ]
     else:
         merged_pairs = pairs_list[0]
         example_pairs = merged_pairs[0:14]
 
     with open(output_file, "w") as fb:
         writer = csv.writer(fb, delimiter="\t")
         # Write DOCSTART and a blank line
-        #writer.writerows([("DOCSTART", None), (None, None)])
+        # writer.writerows([("DOCSTART", None), (None, None)])
         writer.writerows(merged_pairs)
 
     # Print out the first ten rows as a sense check
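Note: with two input datasets, `reduce(zip, pairs_list)` is just `zip(pairs_list[0], pairs_list[1])`, pairing each token's annotations across datasets before they are merged into one output row. A sketch assuming two datasets with invented labels:

    from functools import reduce

    splitting = [("Smith", "B-author"), ("2001", "O")]
    parsing = [("Smith", "O"), ("2001", "B-year")]

    rows = [combine_token_label_pairs(p) for p in reduce(zip, [splitting, parsing])]
    # [("Smith", "B-author", "O"), ("2001", "O", "B-year")]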