Skip to content

Commit 90e06b8

Browse files
authored
Merge pull request #1443 from Unidata/issue1440
tentative fixes for issue #1440
2 parents 93aa5f6 + caed197 commit 90e06b8

File tree

5 files changed

+50
-13
lines changed

5 files changed

+50
-13
lines changed

Changelog

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
version 1.7.4 (not yet released)
2+
================================
3+
* Make sure automatic conversion of character arrays <--> string arrays works for Unicode strings (issue #1440).
4+
(previously only worked correctly for encoding="ascii").
5+
16
version 1.7.3 (tag v1.7.3rel)
27
=============================
38
* Python 3.14 wheels (issue #1432)

include/netcdf-compat.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ static inline int nc_get_alignment(int* thresholdp, int* alignmentp) {
6060
#else
6161
#define HAS_NCRCSET 0
6262
static inline int nc_rc_set(const char* key, const char* value) { return NC_EINVAL; }
63-
static inline const char *nc_rc_get(const char* key) { return NC_EINVAL; }
63+
static inline const char *nc_rc_get(const char* key) { return NULL; }
6464
#endif
6565

6666
#if NC_VERSION_GE(4, 4, 0)

src/netCDF4/__init__.pyi

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -699,6 +699,7 @@ def stringtoarr(
699699
def stringtochar(
700700
a: npt.NDArray[np.character],
701701
encoding: Literal["none", "None", "bytes"],
702+
n_strlen: int | None = None,
702703
) -> npt.NDArray[np.bytes_]: ...
703704
@overload
704705
def stringtochar(

src/netCDF4/_netCDF4.pyx

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,7 +1066,7 @@ If the `_Encoding` special attribute is set for a character array
10661066
(dtype `S1`) variable, the `chartostring` utility function is used to convert the array of
10671067
characters to an array of strings with one less dimension (the last dimension is
10681068
interpreted as the length of each string) when reading the data. The character
1069-
set (usually ascii) is specified by the `_Encoding` attribute. If `_Encoding`
1069+
set is specified by the `_Encoding` attribute. If `_Encoding`
10701070
is 'none' or 'bytes', then the character array is converted to a numpy
10711071
fixed-width byte string array (dtype `S#`), otherwise a numpy unicode (dtype
10721072
`U#`) array is created. When writing the data,
@@ -5525,11 +5525,15 @@ cannot be safely cast to variable data type""" % attname
55255525
# if data is a string or a bytes object, convert to a numpy string array
55265526
# whose length is equal to the rightmost dimension of the
55275527
# variable.
5528-
if type(data) in [str,bytes]: data = numpy.asarray(data,dtype='S'+repr(self.shape[-1]))
5528+
if type(data) in [str,bytes]:
5529+
if encoding == 'ascii':
5530+
data = numpy.asarray(data,dtype='S'+repr(self.shape[-1]))
5531+
else:
5532+
data = numpy.asarray(data,dtype='U'+repr(self.shape[-1]))
55295533
if data.dtype.kind in ['S','U'] and data.dtype.itemsize > 1:
55305534
# if data is a numpy string array, convert it to an array
55315535
# of characters with one more dimension.
5532-
data = stringtochar(data, encoding=encoding)
5536+
data = stringtochar(data, encoding=encoding,n_strlen=self.shape[-1])
55335537

55345538
# if structured data has strings (and _Encoding att set), create view as char arrays
55355539
# (issue #773)
@@ -6771,9 +6775,9 @@ returns a rank 1 numpy character array of length NUMCHARS with datatype `'S1'`
67716775
arr[0:len(string)] = tuple(string)
67726776
return arr
67736777

6774-
def stringtochar(a,encoding='utf-8'):
6778+
def stringtochar(a,encoding='utf-8',n_strlen=None):
67756779
"""
6776-
**`stringtochar(a,encoding='utf-8')`**
6780+
**`stringtochar(a,encoding='utf-8',n_strlen=None)`**
67776781
67786782
convert a string array to a character array with one extra dimension
67796783
@@ -6785,16 +6789,29 @@ optional kwarg `encoding` can be used to specify character encoding (default
67856789
`utf-8`). If `encoding` is 'none' or 'bytes', the input array
67866790
is treated as raw byte strings (`numpy.string_`).
67876791
6792+
optional kwarg `n_strlen` is the length of each string in the output character array (the size of its added last dimension). Default
6793+
is None, which means `n_strlen` will be set to a.itemsize (the number of bytes
6794+
used to represent each string in the input array).
6795+
67886796
returns a numpy character array with datatype `'S1'` or `'U1'`
67896797
and shape `a.shape + (N,)`, where N is the length of each string in a."""
67906798
dtype = a.dtype.kind
6799+
if n_strlen is None:
6800+
n_strlen = a.dtype.itemsize
67916801
if dtype not in ["S","U"]:
67926802
raise ValueError("type must string or unicode ('S' or 'U')")
67936803
if encoding in ['none','None','bytes']:
67946804
b = numpy.array(tuple(a.tobytes()),'S1')
6795-
else:
6805+
elif encoding == 'ascii':
67966806
b = numpy.array(tuple(a.tobytes().decode(encoding)),dtype+'1')
6797-
b.shape = a.shape + (a.itemsize,)
6807+
b.shape = a.shape + (n_strlen,)
6808+
else:
6809+
if not a.ndim:
6810+
a = numpy.array([a])
6811+
bbytes = [text.encode(encoding) for text in a]
6812+
pad = b'\0' * n_strlen
6813+
bbytes = [(x + pad)[:n_strlen] for x in bbytes]
6814+
b = numpy.array([[bb[i:i+1] for i in range(n_strlen)] for bb in bbytes])
67986815
return b
67996816

68006817
def chartostring(b,encoding='utf-8'):
@@ -6816,15 +6833,12 @@ returns a numpy string array with datatype `'UN'` (or `'SN'`) and shape
68166833
dtype = b.dtype.kind
68176834
if dtype not in ["S","U"]:
68186835
raise ValueError("type must be string or unicode ('S' or 'U')")
6819-
if encoding in ['none','None','bytes']:
6820-
bs = b.tobytes()
6821-
else:
6822-
bs = b.tobytes().decode(encoding)
6836+
bs = b.tobytes()
68236837
slen = int(b.shape[-1])
68246838
if encoding in ['none','None','bytes']:
68256839
a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'S'+repr(slen))
68266840
else:
6827-
a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'U'+repr(slen))
6841+
a = numpy.array([bs[n1:n1+slen].decode(encoding) for n1 in range(0,len(bs),slen)],'U'+repr(slen))
68286842
a.shape = b.shape[:-1]
68296843
return a
68306844

test/test_stringarr.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import unittest
44
import os
55
from numpy.testing import assert_array_equal, assert_array_almost_equal
6+
import numpy as np
67

78
def generateString(length, alphabet=string.ascii_letters + string.digits + string.punctuation):
89
return(''.join([random.choice(alphabet) for i in range(length)]))
@@ -20,6 +21,11 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin
2021
datau = data.astype('U')
2122
datac = stringtochar(data, encoding='ascii')
2223

24+
nx, n_strlen = 3, 12
25+
unicode_strings = np.array(['Münster', 'Liége', '東京'],dtype='U'+str(n_strlen))
26+
unicode_strings2 = np.array(['Münster', 'Москва', '東京'],dtype='U'+str(n_strlen))
27+
unicode_strings2_bytes = [b'M', b'\xc3', b'\xbc', b'n', b's', b't', b'e', b'r', b'\xd0', b'\x9c', b'\xd0', b'\xbe', b'\xd1', b'\x81', b'\xd0', b'\xba', b'\xd0', b'\xb2', b'\xd0', b'\xb0', b'\xe6', b'\x9d', b'\xb1', b'\xe4', b'\xba', b'\xac']
28+
2329
class StringArrayTestCase(unittest.TestCase):
2430

2531
def setUp(self):
@@ -28,6 +34,8 @@ def setUp(self):
2834
nc.createDimension('n1',None)
2935
nc.createDimension('n2',n2)
3036
nc.createDimension('nchar',nchar)
37+
nc.createDimension("x", nx)
38+
nc.createDimension("nstr", n_strlen)
3139
v = nc.createVariable('strings','S1',('n1','n2','nchar'))
3240
v2 = nc.createVariable('strings2','S1',('n1','n2','nchar'))
3341
# if _Encoding set, string array should automatically be converted
@@ -44,6 +52,11 @@ def setUp(self):
4452
v2[-1,-1] = data[-1,-1].tobytes() # write single python string
4553
# _Encoding should be ignored if an array of characters is specified
4654
v3[:] = stringtochar(data, encoding='ascii')
55+
# test unicode strings (issue #1440)
56+
v4 = nc.createVariable("strings4", "S1", dimensions=("x", "nstr",))
57+
v4._Encoding = "UTF-8"
58+
v4[:] = unicode_strings
59+
v4[1] = "Москва"
4760
nc.close()
4861

4962
def tearDown(self):
@@ -57,6 +70,10 @@ def runTest(self):
5770
v = nc.variables['strings']
5871
v2 = nc.variables['strings2']
5972
v3 = nc.variables['strings3']
73+
v4 = nc.variables['strings4']
74+
assert np.all(v4[:]==unicode_strings2)
75+
v4.set_auto_chartostring(False)
76+
assert (v4[:].compressed().tolist() == unicode_strings2_bytes)
6077
assert v.dtype.str[1:] in ['S1','U1']
6178
assert v.shape == (nrecs,n2,nchar)
6279
for nrec in range(nrecs):

0 commit comments

Comments
 (0)