|
7 | 7 | import pytest |
8 | 8 |
|
9 | 9 | from deep_reference_parser.io import read_jsonl |
10 | | -from deep_reference_parser.prodigy.prodigy_to_tsv import TokenLabelPairs, prodigy_to_tsv |
| 10 | +from deep_reference_parser.prodigy.prodigy_to_tsv import TokenLabelPairs, prodigy_to_tsv, check_inputs |
11 | 11 |
|
12 | 12 | from .common import TEST_SPANS, TEST_TOKENS |
13 | 13 |
|
@@ -738,6 +738,41 @@ def test_reference_spans_real_example(doc): |
738 | 738 | tlp = TokenLabelPairs(respect_line_endings=False) |
739 | 739 | actual = tlp.run([doc]) |
740 | 740 |
|
741 | | - import pprint |
742 | | - |
743 | 741 | assert actual == expected |
| 742 | + |
def test_check_input_exist_on_doc_mismatch():
    """check_inputs should abort (SystemExit) when the two annotation
    datasets do not share the same document ``_input_hash`` values."""
    first = [{"_input_hash": "a1"}, {"_input_hash": "a2"}]
    second = [{"_input_hash": "b1"}, {"_input_hash": "b2"}]

    with pytest.raises(SystemExit):
        check_inputs([first, second])
| 750 | + |
def test_check_input_exist_on_tokens_mismatch():
    """check_inputs should abort (SystemExit) when document hashes match
    but the token texts differ between the two datasets."""
    left = [
        {"_input_hash": "a", "tokens": [{"text": "a"}]},
        {"_input_hash": "a", "tokens": [{"text": "b"}]},
    ]
    right = [
        {"_input_hash": "a", "tokens": [{"text": "b"}]},
        {"_input_hash": "a", "tokens": [{"text": "b"}]},
    ]

    with pytest.raises(SystemExit):
        check_inputs([left, right])
| 765 | + |
def test_check_input():
    """check_inputs returns a truthy value when both datasets agree on
    document hashes and token texts."""
    left = [
        {"_input_hash": "a", "tokens": [{"text": "a"}]},
        {"_input_hash": "a", "tokens": [{"text": "b"}]},
    ]
    right = [
        {"_input_hash": "a", "tokens": [{"text": "a"}]},
        {"_input_hash": "a", "tokens": [{"text": "b"}]},
    ]

    assert check_inputs([left, right])
0 commit comments