@@ -116,6 +116,78 @@ def _validate_data_input(
116
116
raise GMTInvalidInput ("data must provide x, y, and z columns." )
117
117
118
118
119
+ def _check_encoding (
120
+ argstr : str ,
121
+ ) -> Literal [
122
+ "ascii" ,
123
+ "ISOLatin1+" ,
124
+ "ISO-8859-1" ,
125
+ "ISO-8859-2" ,
126
+ "ISO-8859-3" ,
127
+ "ISO-8859-4" ,
128
+ "ISO-8859-5" ,
129
+ "ISO-8859-6" ,
130
+ "ISO-8859-7" ,
131
+ "ISO-8859-8" ,
132
+ "ISO-8859-9" ,
133
+ "ISO-8859-10" ,
134
+ "ISO-8859-11" ,
135
+ "ISO-8859-13" ,
136
+ "ISO-8859-14" ,
137
+ "ISO-8859-15" ,
138
+ "ISO-8859-16" ,
139
+ ]:
140
+ """
141
+ Check the charset encoding of a string.
142
+
143
+ All characters in the string must be in the same charset encoding, otherwise the
144
+ default ``ISOLatin1+`` encoding is returned. Characters in the Adobe Symbol and
145
+ ZapfDingbats encodings are also checked because they're independent on the choice of
146
+ encodings.
147
+
148
+ Parameters
149
+ ----------
150
+ argstr
151
+ The string to be checked.
152
+
153
+ Returns
154
+ -------
155
+ encoding
156
+ The encoding of the string.
157
+
158
+ Examples
159
+ --------
160
+ >>> _check_encoding("123ABC+-?!") # ASCII characters only
161
+ 'ascii'
162
+ >>> _check_encoding("12AB±β①②") # Characters in ISOLatin1+
163
+ 'ISOLatin1+'
164
+ >>> _check_encoding("12ABāáâãäåβ①②") # Characters in ISO-8859-4
165
+ 'ISO-8859-4'
166
+ >>> _check_encoding("12ABŒā") # Mix characters in ISOLatin1+ (Œ) and ISO-8859-4 (ā)
167
+ 'ISOLatin1+'
168
+ >>> _check_encoding("123AB中文") # Characters not in any charset encoding
169
+ 'ISOLatin1+'
170
+ """
171
+ # Return "ascii" if the string only contains ASCII characters.
172
+ if all (32 <= ord (c ) <= 126 for c in argstr ):
173
+ return "ascii"
174
+ # Loop through all supported encodings and check if all characters in the string
175
+ # are in the charset of the encoding. If all characters are in the charset, return
176
+ # the encoding. The ISOLatin1+ encoding is checked first because it is the default
177
+ # and most common encoding.
178
+ adobe_chars = set (charset ["Symbol" ].values ()) | set (
179
+ charset ["ZapfDingbats" ].values ()
180
+ )
181
+ for encoding in ["ISOLatin1+" ] + [f"ISO-8859-{ i } " for i in range (1 , 17 )]:
182
+ if encoding == "ISO-8859-12" : # ISO-8859-12 was abandoned. Skip it.
183
+ continue
184
+ if all (c in (set (charset [encoding ].values ()) | adobe_chars ) for c in argstr ):
185
+ return encoding # type: ignore[return-value]
186
+ # Return the "ISOLatin1+" encoding if the string contains characters from multiple
187
+ # charset encodings or contains characters that are not in any charset encoding.
188
+ return "ISOLatin1+"
189
+
190
+
119
191
def data_kind (
120
192
data : Any = None , required : bool = True
121
193
) -> Literal ["arg" , "file" , "geojson" , "grid" , "image" , "matrix" , "vectors" ]:
@@ -199,17 +271,41 @@ def data_kind(
199
271
return kind
200
272
201
273
202
- def non_ascii_to_octal (argstr : str ) -> str :
274
+ def non_ascii_to_octal (
275
+ argstr : str ,
276
+ encoding : Literal [
277
+ "ascii" ,
278
+ "ISOLatin1+" ,
279
+ "ISO-8859-1" ,
280
+ "ISO-8859-2" ,
281
+ "ISO-8859-3" ,
282
+ "ISO-8859-4" ,
283
+ "ISO-8859-5" ,
284
+ "ISO-8859-6" ,
285
+ "ISO-8859-7" ,
286
+ "ISO-8859-8" ,
287
+ "ISO-8859-9" ,
288
+ "ISO-8859-10" ,
289
+ "ISO-8859-11" ,
290
+ "ISO-8859-13" ,
291
+ "ISO-8859-14" ,
292
+ "ISO-8859-15" ,
293
+ "ISO-8859-16" ,
294
+ ] = "ISOLatin1+" ,
295
+ ) -> str :
203
296
r"""
204
297
Translate non-ASCII characters to their corresponding octal codes.
205
298
206
- Currently, only characters in the ISOLatin1+ charset and Symbol/ZapfDingbats fonts
207
- are supported.
299
+ Currently, only non-ASCII characters in the Adobe ISOLatin1+, Adobe Symbol, Adobe
300
+ ZapfDingbats, and ISO-8859-x (x can be in 1-11, 13-16) encodings are supported.
301
+ The Adobe Standard encoding is not supported yet.
208
302
209
303
Parameters
210
304
----------
211
305
argstr
212
306
The string to be translated.
307
+ encoding
308
+ The encoding of characters in the string.
213
309
214
310
Returns
215
311
-------
@@ -226,9 +322,11 @@ def non_ascii_to_octal(argstr: str) -> str:
226
322
'@%34%\\041@%%@%34%\\176@%%@%34%\\241@%%@%34%\\376@%%'
227
323
>>> non_ascii_to_octal("ABC ±120° DEF α ♥")
228
324
'ABC \\261120\\260 DEF @~\\141@~ @%34%\\252@%%'
325
+ >>> non_ascii_to_octal("12ABāáâãäåβ①②", encoding="ISO-8859-4")
326
+ '12AB\\340\\341\\342\\343\\344\\345@~\\142@~@%34%\\254@%%@%34%\\255@%%'
229
327
""" # noqa: RUF002
230
- # Return the string if it only contains printable ASCII characters from 32 to 126 .
231
- if all (32 <= ord (c ) <= 126 for c in argstr ):
328
+ # Return the input string if it only contains ASCII characters.
329
+ if encoding == "ascii" or all (32 <= ord (c ) <= 126 for c in argstr ):
232
330
return argstr
233
331
234
332
# Dictionary mapping non-ASCII characters to octal codes
@@ -239,15 +337,15 @@ def non_ascii_to_octal(argstr: str) -> str:
239
337
mapping .update (
240
338
{c : f"@%34%\\ { i :03o} @%%" for i , c in charset ["ZapfDingbats" ].items ()}
241
339
)
242
- # Adobe ISOLatin1+ charset. Put at the end .
243
- mapping .update ({c : f"\\ { i :03o} " for i , c in charset ["ISOLatin1+" ].items ()})
340
+ # ISOLatin1+ or ISO-8859-x charset .
341
+ mapping .update ({c : f"\\ { i :03o} " for i , c in charset [encoding ].items ()})
244
342
245
343
# Remove any printable characters
246
344
mapping = {k : v for k , v in mapping .items () if k not in string .printable }
247
345
return argstr .translate (str .maketrans (mapping ))
248
346
249
347
250
- def build_arg_list (
348
+ def build_arg_list ( # noqa: PLR0912
251
349
kwdict : dict [str , Any ],
252
350
confdict : dict [str , str ] | None = None ,
253
351
infile : str | pathlib .PurePath | Sequence [str | pathlib .PurePath ] | None = None ,
@@ -317,6 +415,10 @@ def build_arg_list(
317
415
... )
318
416
... )
319
417
['f1.txt', 'f2.txt', '-A0', '-B', '--FORMAT_DATE_MAP=o dd', '->out.txt']
418
+ >>> build_arg_list(dict(B="12ABāβ①②"))
419
+ ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-4']
420
+ >>> build_arg_list(dict(B="12ABāβ①②"), confdict=dict(PS_CHAR_ENCODING="ISO-8859-5"))
421
+ ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-5']
320
422
>>> print(build_arg_list(dict(R="1/2/3/4", J="X4i", watre=True)))
321
423
Traceback (most recent call last):
322
424
...
@@ -331,11 +433,22 @@ def build_arg_list(
331
433
elif value is True :
332
434
gmt_args .append (f"-{ key } " )
333
435
elif is_nonstr_iter (value ):
334
- gmt_args .extend (non_ascii_to_octal ( f"-{ key } { _value } " ) for _value in value )
436
+ gmt_args .extend (f"-{ key } { _value } " for _value in value )
335
437
else :
336
- gmt_args .append (non_ascii_to_octal (f"-{ key } { value } " ))
438
+ gmt_args .append (f"-{ key } { value } " )
439
+
440
+ # Convert non-ASCII characters (if any) in the arguments to octal codes
441
+ encoding = _check_encoding ("" .join (gmt_args ))
442
+ if encoding != "ascii" :
443
+ gmt_args = [non_ascii_to_octal (arg , encoding = encoding ) for arg in gmt_args ]
337
444
gmt_args = sorted (gmt_args )
338
445
446
+ # Set --PS_CHAR_ENCODING=encoding if necessary
447
+ if encoding not in {"ascii" , "ISOLatin1+" } and not (
448
+ confdict and "PS_CHAR_ENCODING" in confdict
449
+ ):
450
+ gmt_args .append (f"--PS_CHAR_ENCODING={ encoding } " )
451
+
339
452
if confdict :
340
453
gmt_args .extend (f"--{ key } ={ value } " for key , value in confdict .items ())
341
454
0 commit comments