Skip to content

Commit 248bce5

Browse files
feat: add Translator.create_glossary_from_csv() to create glossaries using CSV format.
Also add CSV support to CLI.
1 parent 4ae0874 commit 248bce5

File tree

7 files changed

+166
-17
lines changed

7 files changed

+166
-17
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99
### Added
10+
* Add `Translator.create_glossary_from_csv()` allowing glossaries downloaded
11+
from the DeepL website to be easily uploaded to the API.
1012
### Changed
1113
### Deprecated
1214
### Removed

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,25 @@ print(
266266
# Example: Created 'My glossary' (559192ed-8e23-...) EN->DE containing 2 entries
267267
```
268268

269+
You can also upload a glossary downloaded from the DeepL website using
270+
`create_glossary_from_csv()`. Instead of supplying the entries as a dictionary,
271+
pass the CSV data as `csv_data`, either as a file-like object or as a string or
272+
bytes containing the file content:
273+
274+
```python
275+
with open('/path/to/glossary_file.csv', 'r') as csv_file:
276+
csv_data = csv_file.read() # Read the file contents as a string
277+
my_csv_glossary = translator.create_glossary_from_csv(
278+
"CSV glossary",
279+
source_lang="EN",
280+
target_lang="DE",
281+
csv_data=csv_data,
282+
)
283+
```
284+
285+
The [API documentation][api-docs-csv-format] explains the expected CSV format in
286+
detail.
287+
269288
#### Getting, listing and deleting stored glossaries
270289

271290
Functions to get, list, and delete stored glossaries are also provided:
@@ -522,6 +541,8 @@ environment variables defined referring to the mock-server.
522541

523542
[api-docs]: https://www.deepl.com/docs-api?utm_source=github&utm_medium=github-python-readme
524543

544+
[api-docs-csv-format]: https://www.deepl.com/docs-api/managing-glossaries/supported-glossary-formats/?utm_source=github&utm_medium=github-python-readme
545+
525546
[api-docs-xml-handling]: https://www.deepl.com/docs-api/handling-xml/?utm_source=github&utm_medium=github-python-readme
526547

527548
[api-docs-lang-list]: https://www.deepl.com/docs-api/translating-text/?utm_source=github&utm_medium=github-python-readme

deepl/__main__.py

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -86,22 +86,37 @@ def action_glossary(
8686

8787

8888
def action_glossary_create(
89-
translator: deepl.Translator, entry_list, file, **kwargs
89+
translator: deepl.Translator, entry_list, file, csv, **kwargs
9090
):
91+
term_separator = None
9192
if file:
9293
if entry_list:
9394
raise deepl.DeepLException(
9495
"The --file argument cannot be used together with "
9596
"command-line entries"
9697
)
97-
file_contents = pathlib.Path(file).read_text("UTF-8")
98-
entry_dict = deepl.convert_tsv_to_dict(file_contents)
98+
content = pathlib.Path(file).read_text("UTF-8")
9999
elif entry_list and entry_list[0] == "-":
100-
entry_dict = deepl.convert_tsv_to_dict(sys.stdin.read())
100+
content = sys.stdin.read()
101101
else:
102-
entry_dict = deepl.convert_tsv_to_dict("\n".join(entry_list), "=")
102+
content = "\n".join(entry_list)
103+
term_separator = "="
104+
if csv:
105+
raise Exception(
106+
"csv option is not compatible with command-line entries"
107+
)
108+
109+
if csv:
110+
glossary = translator.create_glossary_from_csv(
111+
csv_data=content, **kwargs
112+
)
113+
else:
114+
if term_separator:
115+
entry_dict = deepl.convert_tsv_to_dict(content, term_separator)
116+
else:
117+
entry_dict = deepl.convert_tsv_to_dict(content)
118+
glossary = translator.create_glossary(entries=entry_dict, **kwargs)
103119

104-
glossary = translator.create_glossary(entries=entry_dict, **kwargs)
105120
print(f"Created {glossary}")
106121
print_glossaries([glossary])
107122

@@ -367,8 +382,8 @@ def add_common_arguments(subparser: argparse.ArgumentParser):
367382
parser_glossary_create = glossary_subparsers.add_parser(
368383
"create",
369384
help="create a new glossary",
370-
description="create a new glossary using entries specified in "
371-
"a TSV file, standard-input, or provided via command-line",
385+
description="create a new glossary using entries provided via command-"
386+
"line, standard-input, or specified in a TSV or CSV file",
372387
)
373388
parser_glossary_create.add_argument(
374389
"--name", required=True, help="name to be associated with glossary."
@@ -393,17 +408,25 @@ def add_common_arguments(subparser: argparse.ArgumentParser):
393408
type=str,
394409
metavar="SOURCE=TARGET",
395410
help="one or more entries to add to glossary, may be repeated. "
396-
'Alternatively, use "-" to read entries from standard-input in TSV '
397-
"format (see --file argument). These arguments cannot be used "
398-
"together with the --file argument.",
411+
'Alternatively, use "-" to read entries from standard-input in TSV or '
412+
"CSV format (see --file argument for formatting information). These "
413+
"arguments cannot be used together with the --file argument.",
399414
)
400415
parser_glossary_create.add_argument(
401416
"--file",
402417
type=str,
403-
help="file to read glossary entries from. File must be in "
404-
"tab-separated values (TSV) format: one entry-pair per line, each "
405-
"line contains the source entry, a tab, then the target entry. Empty "
406-
"lines are ignored.",
418+
help="file to read glossary entries from. Unless --csv is specified, "
419+
"file format is expected to be tab-separated values (TSV) format: one "
420+
"entry-pair per line, each line contains the source entry, a tab, "
421+
"then the target entry. Empty lines are ignored.",
422+
)
423+
parser_glossary_create.add_argument(
424+
"--csv",
425+
action="store_true",
426+
help="the provided --file option or standard-input should be "
427+
"interpreted as a CSV file. Information about the expected CSV format "
428+
"can be found in the API documentation: "
429+
"https://www.deepl.com/docs-api/managing-glossaries/supported-glossary-formats/.", # noqa
407430
)
408431

409432
parser_glossary_list = glossary_subparsers.add_parser(

deepl/translator.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1173,6 +1173,49 @@ def create_glossary(
11731173
util.convert_dict_to_tsv(entries),
11741174
)
11751175

1176+
def create_glossary_from_csv(
1177+
self,
1178+
name: str,
1179+
source_lang: Union[str, Language],
1180+
target_lang: Union[str, Language],
1181+
csv_data: Union[TextIO, BinaryIO, str, bytes, Any],
1182+
) -> GlossaryInfo:
1183+
"""Creates a glossary with given name for the source and target
1184+
languages, containing the entries in the given CSV data.
1185+
The glossary may be used in the translate_text functions.
1186+
1187+
Only certain language pairs are supported. The available language pairs
1188+
can be queried using get_glossary_languages(). Glossaries are not
1189+
regional specific: a glossary with target language EN may be used to
1190+
translate texts into both EN-US and EN-GB.
1191+
1192+
This function allows you to upload a glossary CSV file that you have
1193+
downloaded from the DeepL website.
1194+
1195+
Information about the expected CSV format can be found in the API
1196+
documentation: https://www.deepl.com/docs-api/managing-glossaries/supported-glossary-formats/ # noqa
1197+
1198+
:param name: user-defined name to attach to glossary.
1199+
:param source_lang: Language of source terms.
1200+
:param target_lang: Language of target terms.
1201+
:param csv_data: CSV data containing glossary entries, either as a
1202+
file-like object or string or bytes containing file content.
1203+
:return: GlossaryInfo containing information about created glossary.
1204+
1205+
:raises ValueError: If the glossary name is empty, or entries are
1206+
empty or invalid.
1207+
:raises DeepLException: If source and target language pair are not
1208+
supported for glossaries.
1209+
"""
1210+
1211+
entries = (
1212+
csv_data if isinstance(csv_data, (str, bytes)) else csv_data.read()
1213+
)
1214+
1215+
return self._create_glossary(
1216+
name, source_lang, target_lang, "csv", entries
1217+
)
1218+
11761219
def get_glossary(self, glossary_id: str) -> GlossaryInfo:
11771220
"""Retrieves GlossaryInfo for the glossary with specified ID.
11781221

tests/conftest.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,27 @@ def example_document_path(tmpdir):
296296
return path
297297

298298

299+
@pytest.fixture
300+
def example_glossary_csv(tmpdir):
301+
tmpdir = pathlib.Path(tmpdir)
302+
content = (
303+
"sourceEntry1,targetEntry1,en,de\n"
304+
'"source""Entry","target,Entry",en,de'
305+
)
306+
path = tmpdir / "glossary" / "example_glossary.csv"
307+
path.parent.mkdir()
308+
path.write_text(content)
309+
return path
310+
311+
312+
@pytest.fixture
313+
def example_glossary_csv_entries():
314+
return {
315+
"sourceEntry1": "targetEntry1",
316+
'source"Entry': "target,Entry",
317+
}
318+
319+
299320
@pytest.fixture
300321
def example_document_translation():
301322
return example_text["DE"]

tests/test_cli.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,16 @@ def test_glossary_no_subcommand(runner):
204204

205205

206206
def test_glossary_create(
207-
runner, glossary_name, tmpdir, cleanup_matching_glossaries
207+
runner,
208+
glossary_name,
209+
tmpdir,
210+
cleanup_matching_glossaries,
211+
example_glossary_csv,
208212
):
209213
name_cli = f"{glossary_name}-cli"
210214
name_stdin = f"{glossary_name}-stdin"
211215
name_file = f"{glossary_name}-file"
216+
name_csv = f"{glossary_name}-csv"
212217
entries = {"Hallo": "Hello", "Maler": "Artist"}
213218
entries_tsv = deepl.convert_dict_to_tsv(entries)
214219
entries_cli = "\n".join(f"{s}={t}" for s, t in entries.items())
@@ -240,6 +245,14 @@ def test_glossary_create(
240245
assert (
241246
result.exit_code == 0
242247
), f"exit: {result.exit_code}\n {result.output}"
248+
result = runner.invoke(
249+
main_function,
250+
f'-vv glossary create --name "{name_csv}" --from EN --to DE '
251+
f"--file {example_glossary_csv} --csv",
252+
)
253+
assert (
254+
result.exit_code == 0
255+
), f"exit: {result.exit_code}\n {result.output}"
243256

244257
result = runner.invoke(main_function, "-vv glossary list")
245258
assert (
@@ -248,6 +261,7 @@ def test_glossary_create(
248261
assert name_cli in result.output
249262
assert name_stdin in result.output
250263
assert name_file in result.output
264+
assert name_csv in result.output
251265

252266
# Cannot use --file option together with entries
253267
result = runner.invoke(
@@ -262,7 +276,8 @@ def test_glossary_create(
262276

263277
finally:
264278
cleanup_matching_glossaries(
265-
lambda glossary: glossary.name in [name_file, name_cli, name_stdin]
279+
lambda glossary: glossary.name
280+
in [name_file, name_cli, name_stdin, name_csv]
266281
)
267282

268283

tests/test_glossary.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,30 @@ def test_glossary_create(
3838
)
3939

4040

41+
def test_glossary_create_csv(
42+
translator,
43+
glossary_name,
44+
cleanup_matching_glossaries,
45+
example_glossary_csv,
46+
example_glossary_csv_entries,
47+
):
48+
source_lang = "EN"
49+
target_lang = "DE"
50+
try:
51+
with open(example_glossary_csv, "r") as csv_data:
52+
glossary = translator.create_glossary_from_csv(
53+
glossary_name, source_lang, target_lang, csv_data=csv_data
54+
)
55+
assert glossary.entry_count == len(example_glossary_csv_entries)
56+
57+
entries = translator.get_glossary_entries(glossary.glossary_id)
58+
assert entries == example_glossary_csv_entries
59+
finally:
60+
cleanup_matching_glossaries(
61+
lambda glossary: glossary.name == glossary_name
62+
)
63+
64+
4165
def test_glossary_create_invalid(
4266
translator, glossary_name, cleanup_matching_glossaries
4367
):

0 commit comments

Comments
 (0)