feat: add Translator.create_glossary_from_csv() to create glossaries using CSV format.

daniel-jones-dev · daniel-jones-dev · commit 248bce56856e · 2022-06-29T09:55:25.000+02:00
Also add CSV support to CLI.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
+* Add `Translator.create_glossary_from_csv()`, allowing glossaries downloaded
+  from the DeepL website to be easily uploaded to the API.
 ### Changed
 ### Deprecated
 ### Removed
diff --git a/README.md b/README.md
@@ -266,6 +266,25 @@ print(
 # Example: Created 'My glossary' (559192ed-8e23-...) EN->DE containing 2 entries
 ```
 
+You can also upload a glossary downloaded from the DeepL website using
+`create_glossary_from_csv()`. Instead of supplying the entries as a dictionary,
+specify the CSV data as `csv_data`, either as a file-like object or as a string
+or bytes object containing the file content:
+
+```python
+with open('/path/to/glossary_file.csv', 'r') as csv_file:
+    csv_data = csv_file.read()  # Read the file contents as a string
+    my_csv_glossary = translator.create_glossary_from_csv(
+        "CSV glossary",
+        source_lang="EN",
+        target_lang="DE",
+        csv_data=csv_data,
+    )
+```
+
+The [API documentation][api-docs-csv-format] explains the expected CSV format in
+detail.
+
 #### Getting, listing and deleting stored glossaries
 
 Functions to get, list, and delete stored glossaries are also provided:
@@ -522,6 +541,8 @@ environment variables defined referring to the mock-server.
 
 [api-docs]: https://www.deepl.com/docs-api?utm_source=github&utm_medium=github-python-readme
 
+[api-docs-csv-format]: https://www.deepl.com/docs-api/managing-glossaries/supported-glossary-formats/?utm_source=github&utm_medium=github-python-readme
+
 [api-docs-xml-handling]: https://www.deepl.com/docs-api/handling-xml/?utm_source=github&utm_medium=github-python-readme
 
 [api-docs-lang-list]: https://www.deepl.com/docs-api/translating-text/?utm_source=github&utm_medium=github-python-readme
diff --git a/deepl/__main__.py b/deepl/__main__.py
@@ -86,22 +86,37 @@ def action_glossary(
 
 
 def action_glossary_create(
-    translator: deepl.Translator, entry_list, file, **kwargs
+    translator: deepl.Translator, entry_list, file, csv, **kwargs
 ):
+    term_separator = None
     if file:
         if entry_list:
             raise deepl.DeepLException(
                 "The --file argument cannot be used together with "
                 "command-line entries"
             )
-        file_contents = pathlib.Path(file).read_text("UTF-8")
-        entry_dict = deepl.convert_tsv_to_dict(file_contents)
+        content = pathlib.Path(file).read_text("UTF-8")
     elif entry_list and entry_list[0] == "-":
-        entry_dict = deepl.convert_tsv_to_dict(sys.stdin.read())
+        content = sys.stdin.read()
     else:
-        entry_dict = deepl.convert_tsv_to_dict("\n".join(entry_list), "=")
+        content = "\n".join(entry_list)
+        term_separator = "="
+        if csv:
+            raise Exception(
+                "csv option is not compatible with command-line entries"
+            )
+
+    if csv:
+        glossary = translator.create_glossary_from_csv(
+            csv_data=content, **kwargs
+        )
+    else:
+        if term_separator:
+            entry_dict = deepl.convert_tsv_to_dict(content, term_separator)
+        else:
+            entry_dict = deepl.convert_tsv_to_dict(content)
+        glossary = translator.create_glossary(entries=entry_dict, **kwargs)
 
-    glossary = translator.create_glossary(entries=entry_dict, **kwargs)
     print(f"Created {glossary}")
     print_glossaries([glossary])
 
@@ -367,8 +382,8 @@ def add_common_arguments(subparser: argparse.ArgumentParser):
     parser_glossary_create = glossary_subparsers.add_parser(
         "create",
         help="create a new glossary",
-        description="create a new glossary using entries specified in "
-        "a TSV file, standard-input, or provided via command-line",
+        description="create a new glossary using entries provided via command-"
+        "line, standard-input, or specified in a TSV or CSV file",
     )
     parser_glossary_create.add_argument(
         "--name", required=True, help="name to be associated with glossary."
@@ -393,17 +408,25 @@ def add_common_arguments(subparser: argparse.ArgumentParser):
         type=str,
         metavar="SOURCE=TARGET",
         help="one or more entries to add to glossary, may be repeated. "
-        'Alternatively, use "-" to read entries from standard-input in TSV '
-        "format (see --file argument). These arguments cannot be used "
-        "together with the --file argument.",
+        'Alternatively, use "-" to read entries from standard-input in TSV or '
+        "CSV format (see --file argument for formatting information). These "
+        "arguments cannot be used together with the --file argument.",
     )
     parser_glossary_create.add_argument(
         "--file",
         type=str,
-        help="file to read glossary entries from. File must be in "
-        "tab-separated values (TSV) format: one entry-pair per line, each "
-        "line contains the source entry, a tab, then the target entry. Empty "
-        "lines are ignored.",
+        help="file to read glossary entries from. Unless --csv is specified, "
+        "file format is expected to be tab-separated values (TSV) format: one "
+        "entry-pair per line, each line contains the source entry, a tab, "
+        "then the target entry. Empty lines are ignored.",
+    )
+    parser_glossary_create.add_argument(
+        "--csv",
+        action="store_true",
+        help="the provided --file option or standard-input should be "
+        "interpreted as a CSV file. Information about the expected CSV format "
+        "can be found in the API documentation: "
+        "https://www.deepl.com/docs-api/managing-glossaries/supported-glossary-formats/.",  # noqa
     )
 
     parser_glossary_list = glossary_subparsers.add_parser(
diff --git a/deepl/translator.py b/deepl/translator.py
@@ -1173,6 +1173,49 @@ def create_glossary(
             util.convert_dict_to_tsv(entries),
         )
 
+    def create_glossary_from_csv(
+        self,
+        name: str,
+        source_lang: Union[str, Language],
+        target_lang: Union[str, Language],
+        csv_data: Union[TextIO, BinaryIO, str, bytes, Any],
+    ) -> GlossaryInfo:
+        """Creates a glossary with given name for the source and target
+        languages, containing the entries in the given CSV data.
+        The glossary may be used in the translate_text functions.
+
+        Only certain language pairs are supported. The available language pairs
+        can be queried using get_glossary_languages(). Glossaries are not
+        region-specific: a glossary with target language EN may be used to
+        translate texts into both EN-US and EN-GB.
+
+        This function allows you to upload a glossary CSV file that you have
+        downloaded from the DeepL website.
+
+        Information about the expected CSV format can be found in the API
+        documentation: https://www.deepl.com/docs-api/managing-glossaries/supported-glossary-formats/  # noqa
+
+        :param name: user-defined name to attach to glossary.
+        :param source_lang: Language of source terms.
+        :param target_lang: Language of target terms.
+        :param csv_data: CSV data containing glossary entries, either as a
+            file-like object or string or bytes containing file content.
+        :return: GlossaryInfo containing information about created glossary.
+
+        :raises ValueError: If the glossary name is empty, or entries are
+            empty or invalid.
+        :raises DeepLException: If source and target language pair are not
+            supported for glossaries.
+        """
+
+        entries = (
+            csv_data if isinstance(csv_data, (str, bytes)) else csv_data.read()
+        )
+
+        return self._create_glossary(
+            name, source_lang, target_lang, "csv", entries
+        )
+
     def get_glossary(self, glossary_id: str) -> GlossaryInfo:
         """Retrieves GlossaryInfo for the glossary with specified ID.
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -296,6 +296,27 @@ def example_document_path(tmpdir):
     return path
 
 
+@pytest.fixture
+def example_glossary_csv(tmpdir):
+    tmpdir = pathlib.Path(tmpdir)
+    content = (
+        "sourceEntry1,targetEntry1,en,de\n"
+        '"source""Entry","target,Entry",en,de'
+    )
+    path = tmpdir / "glossary" / "example_glossary.csv"
+    path.parent.mkdir()
+    path.write_text(content)
+    return path
+
+
+@pytest.fixture
+def example_glossary_csv_entries():
+    return {
+        "sourceEntry1": "targetEntry1",
+        'source"Entry': "target,Entry",
+    }
+
+
 @pytest.fixture
 def example_document_translation():
     return example_text["DE"]
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -204,11 +204,16 @@ def test_glossary_no_subcommand(runner):
 
 
 def test_glossary_create(
-    runner, glossary_name, tmpdir, cleanup_matching_glossaries
+    runner,
+    glossary_name,
+    tmpdir,
+    cleanup_matching_glossaries,
+    example_glossary_csv,
 ):
     name_cli = f"{glossary_name}-cli"
     name_stdin = f"{glossary_name}-stdin"
     name_file = f"{glossary_name}-file"
+    name_csv = f"{glossary_name}-csv"
     entries = {"Hallo": "Hello", "Maler": "Artist"}
     entries_tsv = deepl.convert_dict_to_tsv(entries)
     entries_cli = "\n".join(f"{s}={t}" for s, t in entries.items())
@@ -240,6 +245,14 @@ def test_glossary_create(
         assert (
             result.exit_code == 0
         ), f"exit: {result.exit_code}\n {result.output}"
+        result = runner.invoke(
+            main_function,
+            f'-vv glossary create --name "{name_csv}" --from EN --to DE '
+            f"--file {example_glossary_csv} --csv",
+        )
+        assert (
+            result.exit_code == 0
+        ), f"exit: {result.exit_code}\n {result.output}"
 
         result = runner.invoke(main_function, "-vv glossary list")
         assert (
@@ -248,6 +261,7 @@ def test_glossary_create(
         assert name_cli in result.output
         assert name_stdin in result.output
         assert name_file in result.output
+        assert name_csv in result.output
 
         # Cannot use --file option together with entries
         result = runner.invoke(
@@ -262,7 +276,8 @@ def test_glossary_create(
 
     finally:
         cleanup_matching_glossaries(
-            lambda glossary: glossary.name in [name_file, name_cli, name_stdin]
+            lambda glossary: glossary.name
+            in [name_file, name_cli, name_stdin, name_csv]
         )
 
 
diff --git a/tests/test_glossary.py b/tests/test_glossary.py
@@ -38,6 +38,30 @@ def test_glossary_create(
         )
 
 
+def test_glossary_create_csv(
+    translator,
+    glossary_name,
+    cleanup_matching_glossaries,
+    example_glossary_csv,
+    example_glossary_csv_entries,
+):
+    source_lang = "EN"
+    target_lang = "DE"
+    try:
+        with open(example_glossary_csv, "r") as csv_data:
+            glossary = translator.create_glossary_from_csv(
+                glossary_name, source_lang, target_lang, csv_data=csv_data
+            )
+        assert glossary.entry_count == len(example_glossary_csv_entries)
+
+        entries = translator.get_glossary_entries(glossary.glossary_id)
+        assert entries == example_glossary_csv_entries
+    finally:
+        cleanup_matching_glossaries(
+            lambda glossary: glossary.name == glossary_name
+        )
+
+
 def test_glossary_create_invalid(
     translator, glossary_name, cleanup_matching_glossaries
 ):