Skip to content

Commit 23c8b01

Browse files
committed
WIP: add option for removing back slash from input data
1 parent b93d29a commit 23c8b01

10 files changed

+288
-8
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ authors = [
77
{ name = "Tyler VanZanten", email = "tvanzanten@hypercision.com" }
88
]
99
dependencies = [
10+
"configparser",
1011
"requests",
1112
]
1213
license = { file="LICENSE" }

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ chardet==5.2.0
55
charset-normalizer==3.3.2
66
click==8.1.7
77
colorama==0.4.6
8+
configparser==7.0.0
89
distlib==0.3.8
910
exceptiongroup==1.1.1
1011
filelock==3.20.2
@@ -20,6 +21,7 @@ pyproject-api==1.7.1
2021
pytest==8.3.2
2122
PyYAML==6.0.1
2223
requests==2.32.5
24+
six==1.16.0
2325
tomli==2.0.1
2426
tox==4.16.0
2527
typing_extensions==4.12.2

src/i18ntools/parse_i18n_file.py

Lines changed: 91 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,33 @@
11
#!/usr/bin/env python
22
"""Parses an i18n Java properties file and returns the data as a dictionary.
33
4-
The benefit of this method over using configparser is that the whitespace in
4+
If called with remove_backslashes=False, then the whitespace in
55
multiline values is preserved.
66
7+
If called with remove_backslashes=True, then configparser is used
8+
and the whitespace and backslashes in multiline values are removed.
9+
710
Note that this method does not work properly for multiline translations
811
with an "=" character in them.
9-
10-
See related question: https://stackoverflow.com/questions/76047202
1112
"""
1213

1314
import argparse
15+
import configparser
16+
import os
1417
from pathlib import Path
1518

1619

17-
def parse_i18n_file(file_path):
20+
def parse_i18n_file(file_path, remove_backslashes=False):
1821
"""Parses an i18n Java properties file and returns the data as a dictionary.
1922
2023
Note that this method does not work properly for multiline translations
2124
with an "=" character in them.
2225
2326
Keyword arguments:
2427
file_path -- filepath of the i18n Java properties file to parse
28+
remove_backslashes -- when true, the data returned will not have the
29+
backslashes used in multiline values.
30+
Multiline values will be transformed into single line values.
2531
"""
2632
if not Path(file_path).exists():
2733
raise FileNotFoundError(f"File {file_path} does not exist")
@@ -60,6 +66,77 @@ def parse_i18n_file(file_path):
6066
f"It has at least one duplicate key: {duplicate_keys}"
6167
)
6268

69+
if remove_backslashes:
70+
# Now that we've ensured the file has no duplicate properties, return
71+
# the data as a dictionary with multiline values transformed into
72+
# single line values.
73+
return parse_i18n_file_without_backslashes(file_path)
74+
75+
return data
76+
77+
78+
def convert_properties_to_ini(input_path, ini_path):
    """Copies a properties file into an .ini file that configparser can read.

    The output file holds a [DEFAULT] section header line followed by the
    unmodified contents of the input file.

    Keyword arguments:
    input_path -- filepath of the i18n Java properties file to convert
    ini_path -- filepath of the output .ini file
    """
    with open(input_path, "r", encoding="utf-8") as properties_file:
        with open(ini_path, "w", encoding="utf-8") as ini_file:
            # configparser needs every option to live under a section header
            ini_file.write("[DEFAULT]\n")
            for properties_line in properties_file:
                ini_file.write(properties_line)
92+
93+
94+
def merge_multiline_string(multiline_string):
    """Collapses a multiline string into a single line string.

    Every line has its trailing backslashes removed and is then trimmed of
    leading/trailing whitespace. The cleaned lines are joined together with
    a single space between them.
    """
    cleaned_lines = []
    for raw_line in multiline_string.splitlines():
        # Drop the backslashes first so the whitespace that preceded them
        # is trimmed as well
        without_backslashes = raw_line.rstrip("\\")
        cleaned_lines.append(without_backslashes.strip())
    return " ".join(cleaned_lines)
105+
106+
107+
def parse_i18n_file_without_backslashes(file_path):
    """Parses an i18n Java properties file and returns the data as a dictionary.
    Multiline values will be transformed into single line values with the
    backslashes removed.

    The file contents are handed to configparser in memory, prefixed with a
    [DEFAULT] section header, so no temporary file is written to disk.

    Note that this method does not work properly for multiline translations
    with an "=" character in them.

    Keyword arguments:
    file_path -- filepath of the i18n Java properties file to parse

    Returns a dictionary mapping each property key (case preserved) to its
    value as a single line string.

    Raises FileNotFoundError if file_path does not exist. Content that
    configparser cannot parse is reported through its own exceptions
    (subclasses of configparser.Error).
    """
    properties_path = Path(file_path)
    if not properties_path.exists():
        # Single-argument form, matching parse_i18n_file. Passing a second
        # positional argument makes OSError treat the message as an errno.
        raise FileNotFoundError(f"File {file_path} does not exist")

    # Use RawConfigParser to avoid any interpolation or automatic conversions
    config = configparser.RawConfigParser(empty_lines_in_values=False)
    # Override the optionxform method to prevent lowercase conversion of the keys
    config.optionxform = str  # type: ignore

    # configparser only accepts options inside a section, so prepend a dummy
    # section header to the properties text before parsing it.
    properties_text = properties_path.read_text(encoding="utf-8")
    config.read_string("[DEFAULT]\n" + properties_text)

    return {
        key: merge_multiline_string(value)
        for key, value in config["DEFAULT"].items()
    }
64141

65142

@@ -80,8 +157,17 @@ def main():
80157
"Can be specified as a relative or absolute file path."
81158
),
82159
)
160+
parser.add_argument(
161+
"-r",
162+
"--remove_backslashes",
163+
action="store_true",
164+
help=(
165+
"the data returned will not have the "
166+
"backslashes used in multiline values."
167+
),
168+
)
83169
args = parser.parse_args()
84-
result = parse_i18n_file(args.input_file)
170+
result = parse_i18n_file(args.input_file, args.remove_backslashes)
85171
for key, value in result.items():
86172
print("key", key)
87173
print("value", value)

src/i18ntools/translate.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def make_api_call(
106106
def translate_file(
107107
input_file_path,
108108
output_lang,
109+
remove_backslashes=False,
109110
output_file_path=None,
110111
input_lang=default_lang,
111112
translator_region=default_region,
@@ -120,7 +121,7 @@ def translate_file(
120121
output_file_path = get_default_filepath(input_file_path, output_lang)
121122

122123
# Parse the input file into a dictionary
123-
input_data = parse_i18n_file(input_file_path)
124+
input_data = parse_i18n_file(input_file_path, remove_backslashes)
124125

125126
# Open the input file in read mode to read its contents
126127
with open(input_file_path, "r", encoding="utf-8") as f:
@@ -214,9 +215,23 @@ def main():
214215
default=default_region,
215216
help="region of the Azure translator resource. Defaults to eastus2",
216217
)
218+
parser.add_argument(
219+
"-rbs",
220+
"--remove_backslashes",
221+
action="store_true",
222+
help=(
223+
"any backslashes from multiline values in the input file "
224+
"will not be included in the text that gets translated."
225+
),
226+
)
217227
args = parser.parse_args()
218228
translate_file(
219-
args.input_file, args.to, args.output_file, args.from_lang, args.region
229+
args.input_file,
230+
args.to,
231+
args.remove_backslashes,
232+
args.output_file,
233+
args.from_lang,
234+
args.region,
220235
)
221236

222237

src/i18ntools/translate_missing.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ def translate_missing_messages(
2727
input_file_path,
2828
output_lang,
2929
sort_file=False,
30+
remove_backslashes=False,
3031
output_file_path=None,
3132
input_lang=default_lang,
3233
translator_region=default_region,
@@ -46,7 +47,7 @@ def translate_missing_messages(
4647
raise FileNotFoundError(f"File {output_file_path} does not exist")
4748

4849
# Parse the input file and output file into a dictionary
49-
input_data = parse_i18n_file(input_file_path)
50+
input_data = parse_i18n_file(input_file_path, remove_backslashes)
5051
output_data = parse_i18n_file(output_file_path)
5152

5253
# Find any i18n messages missing from the output file
@@ -148,6 +149,15 @@ def main():
148149
default=default_region,
149150
help="region of the Azure translator resource. Defaults to eastus2",
150151
)
152+
parser.add_argument(
153+
"-rbs",
154+
"--remove_backslashes",
155+
action="store_true",
156+
help=(
157+
"any backslashes from multiline values in the input file "
158+
"will not be included in the text that gets translated."
159+
),
160+
)
151161
parser.add_argument(
152162
"-s",
153163
"--sort",
@@ -163,6 +173,7 @@ def main():
163173
args.input_file,
164174
args.to,
165175
args.sort,
176+
args.remove_backslashes,
166177
args.output_file,
167178
args.from_lang,
168179
args.region,
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
interactions:
2+
- request:
3+
body: '[{"text": "I want to see you knocking at the door. I wanna leave you out
4+
there waiting in the downpour. Singing that you\u2019re sorry, dripping on the
5+
hall floor."}, {"text": "The customSubmitTS parameter is missing. It must be
6+
present and of type Date."}, {"text": "{0} session removed."}, {"text": "The
7+
trial period has ended for your account and you can no longer use the application."},
8+
{"text": "Instructor is disabled"}]'
9+
headers:
10+
Accept:
11+
- '*/*'
12+
Accept-Encoding:
13+
- gzip, deflate
14+
Connection:
15+
- keep-alive
16+
Content-Length:
17+
- '427'
18+
Content-Type:
19+
- application/json
20+
Ocp-Apim-Subscription-Region:
21+
- eastus2
22+
User-Agent:
23+
- python-requests/2.32.3
24+
method: POST
25+
uri: https://api.cognitive.microsofttranslator.com/translate?api-version=3.0&from=en&to=de
26+
response:
27+
body:
28+
string: "[{\"translations\":[{\"text\":\"Ich m\xF6chte dich an die T\xFCr klopfen
29+
sehen. Ich m\xF6chte dich da drau\xDFen im Regenguss warten lassen. Singen,
30+
dass es dir leid tut, tropfend auf den Flurboden.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Der
31+
Parameter customSubmitTS fehlt. Es muss vorhanden sein und vom Typ Datum sein.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"{0}
32+
Sitzung entfernt.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Die Testphase
33+
f\xFCr Ihr Konto ist abgelaufen und Sie k\xF6nnen die Anwendung nicht mehr
34+
verwenden.\",\"to\":\"de\"}]},{\"translations\":[{\"text\":\"Der Instruktor
35+
ist deaktiviert\",\"to\":\"de\"}]}]"
36+
headers:
37+
Connection:
38+
- keep-alive
39+
Content-Type:
40+
- application/json; charset=utf-8
41+
Date:
42+
- Wed, 19 Feb 2025 21:21:30 GMT
43+
Strict-Transport-Security:
44+
- max-age=31536000; includeSubDomains
45+
Transfer-Encoding:
46+
- chunked
47+
access-control-expose-headers:
48+
- X-RequestId,X-Metered-Usage,X-MT-System
49+
x-content-type-options:
50+
- nosniff
51+
x-envoy-upstream-service-time:
52+
- '5'
53+
x-metered-usage:
54+
- '352'
55+
x-mt-system:
56+
- Microsoft
57+
x-requestid:
58+
- b8714aec-369c-4615-8a1e-5b34c438e6eb.USE2.0219T2121
59+
status:
60+
code: 200
61+
message: OK
62+
version: 1
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
default.invalid.min.message=Eigenschaft [{0}] der Klasse [{1}] mit dem Wert [{2}] ist kleiner als der Mindestwert [{3}]
2+
3+
# Track 4 on Expert In A Dying Field
4+
TheBeths.YourSide.lyrics=Ich möchte dich an die Tür klopfen sehen. Ich möchte dich da draußen im Regenguss warten lassen. Singen, dass es dir leid tut, tropfend auf den Flurboden.
5+
instructor.submitWithCustomTime.customSubmitTS.missing.error=Der Parameter customSubmitTS fehlt. Es muss vorhanden sein und vom Typ Datum sein.
6+
7+
# SessionItem.itemID is the first parameter
8+
instructorService.removeSession.success={0} Sitzung entfernt.
9+
10+
handshake.register.suspended.error=Die Testphase für Ihr Konto ist abgelaufen und Sie können die Anwendung nicht mehr verwenden.
11+
handshake.register.disabledException.error=Der Instruktor ist deaktiviert

tests/test_parse_i18n_file.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,58 @@ def test_parse_file_2():
4949
"\n \\n\\"
5050
"\n \\ Email: tina.herring@yourcompany.com"
5151
)
52+
# Now test parsing the file when removing backslashes
53+
parsed_data = parse_i18n_file(
54+
"tests/resources/example2.properties", remove_backslashes=True
55+
)
56+
assert len(parsed_data.keys()) == 4
57+
assert parsed_data["handshake.register.mobileDeviceLimitReached.error"] == (
58+
"The device limit of {0} devices has been reached "
59+
"for your company''s account and new devices cannot use the application. "
60+
"Please contact your administrator."
61+
)
62+
assert parsed_data["recordAttendance.segment.notFound.error"] == (
63+
"Segment with segmentID {0} not found in the SessionItem's segments"
64+
)
65+
assert parsed_data["me"] == "first!"
66+
assert parsed_data["clientHelpText"] == (
67+
"For issues with the software, tablets, and card scanners "
68+
"only, please contact Tina Herring at \\n \\n "
69+
"\\ Email: tina.herring@yourcompany.com"
70+
)
71+
72+
73+
def test_parse_file_and_remove_slashes():
    """Checks the values parsed from example.properties when
    remove_backslashes=True is passed.
    """
    result = parse_i18n_file(
        "tests/resources/example.properties", remove_backslashes=True
    )
    # Expected single line value for each key that is checked
    expected_values = {
        "instructorService.removeSession.success": "{0} session removed.",
        "default.invalid.min.message": (
            "Property [{0}] of class [{1}] with value "
            "[{2}] is less than minimum value [{3}]"
        ),
        "instructor.submitWithCustomTime.customSubmitTS.missing.error": (
            "The customSubmitTS parameter is missing. "
            "It must be present and of type Date."
        ),
        "TheBeths.YourSide.lyrics": (
            "I want to see you knocking at the door. "
            "I wanna leave you out there waiting in the downpour. "
            "Singing that you’re sorry, dripping on the hall floor."
        ),
    }

    assert len(result.keys()) == 6
    for message_key, expected_text in expected_values.items():
        assert message_key in result
        assert result[message_key] == expected_text
5299

53100

54101
def test_parse_file_with_duplicate_keys():
    """SyntaxWarning is raised for files with duplicate keys, both with the
    default arguments and with remove_backslashes=True
    """
    duplicate_file = "tests/resources/duplicate.properties"
    with pytest.raises(SyntaxWarning):
        parse_i18n_file(duplicate_file)
    with pytest.raises(SyntaxWarning):
        parse_i18n_file(duplicate_file, remove_backslashes=True)

tests/test_translate.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,7 @@ def test_make_api_call(fake_german_translations):
9090
input_data = parse_i18n_file("tests/resources/example.properties")
9191
translation_info = make_api_call(input_data, "de")
9292
assert translation_info == fake_german_translations
93+
94+
95+
# TODO: add a test case where translate_file is called
96+
# with remove_backslashes=True

0 commit comments

Comments
 (0)