Skip to content

Commit bc7489c

Browse files
committed
add option for removing back slash from input data
1 parent b93d29a commit bc7489c

17 files changed

+504
-77
lines changed

CONTRIBUTING.md

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,6 @@ Instructions:
1010

1111
## Testing
1212

13-
### vcrpy
14-
15-
In addition to [`pytest`](https://docs.pytest.org/), we also use the [`vcrpy`](https://vcrpy.readthedocs.io/) library when writing our tests.
16-
1713
### tox
1814

1915
To run the tests, install the project dependencies in a [virtual environment](https://docs.python.org/3/library/venv.html#module-venv)
@@ -41,6 +37,17 @@ pip install "<package_name>"
4137
pip freeze > requirements.txt
4238
```
4339

40+
### vcrpy
41+
42+
In addition to [`pytest`](https://docs.pytest.org/), we also use the [`vcrpy`](https://vcrpy.readthedocs.io/) library when writing our tests.
43+
44+
If you need to update or regenerate a cassette for a test, e.g. [`tests/cassettes/test_translate_missing_messages_without_sorting.yml`](https://github.com/hypercision/i18ntools/blob/main/tests/cassettes/test_translate_missing_messages_without_sorting.yml), then:
45+
46+
- delete the cassette yml file
47+
- update the `os.environ["TRANSLATOR_API_SUBSCRIPTION_KEY"]` line in the test so it is set to a real API key (but do not commit this change)
48+
- run the tests with `tox`. This will regenerate the cassette yml file
49+
- revert the `os.environ["TRANSLATOR_API_SUBSCRIPTION_KEY"]` line in the test so it is no longer a real API key
50+
4451
### Editable installation
4552

4653
Alternatively, you can perform an [editable installation](https://setuptools.pypa.io/en/latest/userguide/development_mode.html)

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ authors = [
77
{ name = "Tyler VanZanten", email = "tvanzanten@hypercision.com" }
88
]
99
dependencies = [
10+
"configparser",
1011
"requests",
1112
]
1213
license = { file="LICENSE" }

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ chardet==5.2.0
55
charset-normalizer==3.3.2
66
click==8.1.7
77
colorama==0.4.6
8+
configparser==7.0.0
89
distlib==0.3.8
910
exceptiongroup==1.1.1
1011
filelock==3.20.2
@@ -20,6 +21,7 @@ pyproject-api==1.7.1
2021
pytest==8.3.2
2122
PyYAML==6.0.1
2223
requests==2.32.5
24+
six==1.16.0
2325
tomli==2.0.1
2426
tox==4.16.0
2527
typing_extensions==4.12.2

src/i18ntools/parse_i18n_file.py

Lines changed: 92 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,33 @@
11
#!/usr/bin/env python
22
"""Parses an i18n Java properties file and returns the data as a dictionary.
33
4-
The benefit of this method over using configparser is that the whitespace in
4+
If called with remove_backslashes=False, then the whitespace in
55
multiline values is preserved.
66
7+
If called with remove_backslashes=True, then configparser is used
8+
and the whitespace and backslashes in multiline values are removed.
9+
710
Note that this method does not work properly for multiline translations
811
with an "=" character in them.
9-
10-
See related question: https://stackoverflow.com/questions/76047202
1112
"""
1213

1314
import argparse
15+
import configparser
16+
import os
1417
from pathlib import Path
1518

1619

17-
def parse_i18n_file(file_path):
20+
def parse_i18n_file(file_path, remove_backslashes=False):
1821
"""Parses an i18n Java properties file and returns the data as a dictionary.
1922
2023
Note that this method does not work properly for multiline translations
2124
with an "=" character in them.
2225
2326
Keyword arguments:
2427
file_path -- filepath of the i18n Java properties file to parse
28+
remove_backslashes -- when true, the data returned will not have the
29+
backslashes used in multiline values.
30+
Multiline values will be transformed into single line values.
2531
"""
2632
if not Path(file_path).exists():
2733
raise FileNotFoundError(f"File {file_path} does not exist")
@@ -60,6 +66,78 @@ def parse_i18n_file(file_path):
6066
f"It has at least one duplicate key: {duplicate_keys}"
6167
)
6268

69+
if remove_backslashes:
70+
# Now that we've ensured the file has no duplicate properties, return
71+
# the data as a dictionary with multiline values transformed into
72+
# single line values.
73+
return parse_i18n_file_without_backslashes(file_path)
74+
75+
return data
76+
77+
78+
def convert_properties_to_ini(input_path, ini_path):
    """Copies a properties file into a new .ini file, preceded by a [DEFAULT]
    section header so that configparser is able to read it.

    Keyword arguments:
    input_path -- filepath of the i18n Java properties file to convert
    ini_path -- filepath of the output .ini file
    """
    with open(input_path, "r", encoding="utf-8") as properties_file:
        with open(ini_path, "w", encoding="utf-8") as ini_file:
            # Properties files have no sections, but configparser requires
            # one, so a dummy header goes in front of the copied content.
            ini_file.write("[DEFAULT]\n")
            ini_file.write(properties_file.read())
93+
94+
95+
def merge_multiline_string(multiline_string):
    """Collapses a multiline string into a single line string.

    Any backslashes at the end of a line are dropped, the surrounding
    whitespace of each line is trimmed, and the lines are joined together
    with a single space between them.

    Keyword arguments:
    multiline_string -- the (possibly multiline) string to merge
    """
    cleaned_lines = []
    for raw_line in multiline_string.splitlines():
        # Drop the trailing continuation backslash(es) first, then trim the
        # whitespace that surrounded the text on this line.
        without_backslash = raw_line.rstrip("\\")
        cleaned_lines.append(without_backslash.strip())
    return " ".join(cleaned_lines)
106+
107+
108+
def parse_i18n_file_without_backslashes(file_path):
    """Parses an i18n Java properties file and returns the data as a dictionary.
    Multiline values will be transformed into single line values with the
    backslashes removed.

    The properties file is copied into a temporary .ini file (with a
    [DEFAULT] section header) which is then parsed with configparser.
    The temporary file lives in its own private temporary directory, so
    nothing in the current working directory is touched, and it is removed
    even when the conversion or the parsing fails.

    Note that this method does not work properly for multiline translations
    with an "=" character in them.

    Keyword arguments:
    file_path -- filepath of the i18n Java properties file to parse

    Returns a dictionary mapping each property key to its single line value.

    Raises FileNotFoundError if file_path does not exist.
    """
    # Imported locally so this function brings its own dependency into scope.
    import tempfile

    if not Path(file_path).exists():
        # A single message argument keeps str(error) readable; passing a
        # second positional argument makes OSError treat the message as errno.
        raise FileNotFoundError(f"File {file_path} does not exist")

    # Use RawConfigParser to avoid any interpolation or automatic conversions
    config = configparser.RawConfigParser(empty_lines_in_values=False)
    # Override the optionxform method to prevent lowercase conversion of the keys
    config.optionxform = str  # type: ignore

    # The context manager deletes the directory, and the .ini file inside it,
    # whether or not an exception is raised while converting or parsing.
    with tempfile.TemporaryDirectory() as temp_dir:
        ini_file_path = os.path.join(temp_dir, "temp_file.ini")
        convert_properties_to_ini(file_path, ini_file_path)
        config.read(ini_file_path, encoding="utf-8")

    return {
        key: merge_multiline_string(value)
        for key, value in config["DEFAULT"].items()
    }
64142

65143

@@ -80,8 +158,17 @@ def main():
80158
"Can be specified as a relative or absolute file path."
81159
),
82160
)
161+
parser.add_argument(
162+
"-r",
163+
"--remove_backslashes",
164+
action="store_true",
165+
help=(
166+
"the data returned will not have the "
167+
"backslashes used in multiline values."
168+
),
169+
)
83170
args = parser.parse_args()
84-
result = parse_i18n_file(args.input_file)
171+
result = parse_i18n_file(args.input_file, args.remove_backslashes)
85172
for key, value in result.items():
86173
print("key", key)
87174
print("value", value)

src/i18ntools/translate.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def make_api_call(
106106
def translate_file(
107107
input_file_path,
108108
output_lang,
109+
remove_backslashes=False,
109110
output_file_path=None,
110111
input_lang=default_lang,
111112
translator_region=default_region,
@@ -120,7 +121,7 @@ def translate_file(
120121
output_file_path = get_default_filepath(input_file_path, output_lang)
121122

122123
# Parse the input file into a dictionary
123-
input_data = parse_i18n_file(input_file_path)
124+
input_data = parse_i18n_file(input_file_path, remove_backslashes)
124125

125126
# Open the input file in read mode to read its contents
126127
with open(input_file_path, "r", encoding="utf-8") as f:
@@ -214,9 +215,23 @@ def main():
214215
default=default_region,
215216
help="region of the Azure translator resource. Defaults to eastus2",
216217
)
218+
parser.add_argument(
219+
"-rbs",
220+
"--remove_backslashes",
221+
action="store_true",
222+
help=(
223+
"any backslashes from multiline values in the input file "
224+
"will not be included in the text that gets translated."
225+
),
226+
)
217227
args = parser.parse_args()
218228
translate_file(
219-
args.input_file, args.to, args.output_file, args.from_lang, args.region
229+
args.input_file,
230+
args.to,
231+
args.remove_backslashes,
232+
args.output_file,
233+
args.from_lang,
234+
args.region,
220235
)
221236

222237

src/i18ntools/translate_missing.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def translate_missing_messages(
2727
input_file_path,
2828
output_lang,
2929
sort_file=False,
30+
remove_backslashes=False,
3031
output_file_path=None,
3132
input_lang=default_lang,
3233
translator_region=default_region,
@@ -46,7 +47,7 @@ def translate_missing_messages(
4647
raise FileNotFoundError(f"File {output_file_path} does not exist")
4748

4849
# Parse the input file and output file into a dictionary
49-
input_data = parse_i18n_file(input_file_path)
50+
input_data = parse_i18n_file(input_file_path, remove_backslashes)
5051
output_data = parse_i18n_file(output_file_path)
5152

5253
# Find any i18n messages missing from the output file
@@ -148,6 +149,15 @@ def main():
148149
default=default_region,
149150
help="region of the Azure translator resource. Defaults to eastus2",
150151
)
152+
parser.add_argument(
153+
"-rbs",
154+
"--remove_backslashes",
155+
action="store_true",
156+
help=(
157+
"any backslashes from multiline values in the input file "
158+
"will not be included in the text that gets translated."
159+
),
160+
)
151161
parser.add_argument(
152162
"-s",
153163
"--sort",
@@ -163,6 +173,7 @@ def main():
163173
args.input_file,
164174
args.to,
165175
args.sort,
176+
args.remove_backslashes,
166177
args.output_file,
167178
args.from_lang,
168179
args.region,
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
interactions:
2+
- request:
3+
body: '[{"text": "Property [{0}] of class [{1}] with value [{2}] is less than
4+
minimum value [{3}]"}, {"text": "I want to see you knocking at the door. I wanna
5+
leave you out there waiting in the downpour. Singing that you\u2019re sorry,
6+
dripping on the hall floor."}, {"text": "The customSubmitTS parameter is missing.
7+
It must be present and of type Date."}, {"text": "{0} session removed."}, {"text":
8+
"The trial period has ended for your account and you can no longer use the application."},
9+
{"text": "Instructor is disabled"}, {"text": " Attendance actions made on this
10+
page will also be made for every session in this group."}, {"text": "Errors:
11+
{0}. \\n\\n Sessions successfully removed: {1}"}]'
12+
headers:
13+
Accept:
14+
- '*/*'
15+
Accept-Encoding:
16+
- gzip, deflate
17+
Connection:
18+
- keep-alive
19+
Content-Length:
20+
- '690'
21+
Content-Type:
22+
- application/json
23+
Ocp-Apim-Subscription-Region:
24+
- eastus2
25+
User-Agent:
26+
- python-requests/2.32.5
27+
method: POST
28+
uri: https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&from=en&to=de
29+
response:
30+
body:
31+
string: "[{\"translations\":[{\"text\":\"Die Eigenschaft [{0}] der Klasse [{1}]
32+
mit Wert [{2}] ist kleiner als der Mindestwert [{3}]\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Ich
33+
will dich an der T\xFCr klopfen sehen. Ich will dich drau\xDFen im Wolkenbruch
34+
warten lassen. Singend, dass es dir leid tut, tropfend auf den Flurboden.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Der
35+
customSubmitTS-Parameter fehlt. Es muss anwesend und vom Typ Datum sein.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"{0}
36+
Sitzung entfernt.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Die Testphase
37+
f\xFCr dein Konto ist beendet und du kannst die Anwendung nicht mehr nutzen.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Der
38+
Ausbilder ist behindert\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"
39+
F\xFCr jede Sitzung in dieser Gruppe werden auch die auf dieser Seite vorgenommenen
40+
Anwesenheitsma\xDFnahmen vorgenommen.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Fehler:
41+
{0}. \\\\n\\\\n Sitzungen erfolgreich entfernt: {1}\",\"to\":\"de\"}]}]"
42+
headers:
43+
Connection:
44+
- keep-alive
45+
Content-Type:
46+
- application/json; charset=utf-8
47+
Date:
48+
- Wed, 14 Jan 2026 20:16:28 GMT
49+
Strict-Transport-Security:
50+
- max-age=31536000; includeSubDomains
51+
Transfer-Encoding:
52+
- chunked
53+
access-control-expose-headers:
54+
- X-RequestId,X-Metered-Usage,X-MT-System
55+
x-content-type-options:
56+
- nosniff
57+
x-envoy-upstream-service-time:
58+
- '438'
59+
x-metered-usage:
60+
- '571'
61+
x-mt-system:
62+
- Microsoft
63+
x-requestid:
64+
- 5e1ef095-341f-4bca-b80b-b07d6089bf28.EUWE.0114T2016
65+
status:
66+
code: 200
67+
message: OK
68+
version: 1

0 commit comments

Comments
 (0)