Merge pull request #1443 from Unidata/issue1440

jswhit · web-flow · commit 90e06b802543 · 2025-10-28T13:48:12.000-06:00
tentative fixes for issue #1440
diff --git a/Changelog b/Changelog
@@ -1,3 +1,8 @@
+ version 1.7.4 (not yet released)
+ ================================
+ * Make sure automatic conversion of character arrays <--> string arrays works for Unicode strings (issue #1440).
+   Previously it only worked correctly for encoding="ascii".
+
  version 1.7.3 (tag v1.7.3rel)
  =============================
  * Python 3.14 wheels (issue #1432)
diff --git a/include/netcdf-compat.h b/include/netcdf-compat.h
@@ -60,7 +60,7 @@ static inline int nc_get_alignment(int* thresholdp, int* alignmentp) {
 #else
 #define HAS_NCRCSET 0
 static inline int nc_rc_set(const char* key, const char* value) { return NC_EINVAL; }
-static inline const char *nc_rc_get(const char* key) { return NC_EINVAL; }
+static inline const char *nc_rc_get(const char* key) { return NULL; }
 #endif
 
 #if NC_VERSION_GE(4, 4, 0)
diff --git a/src/netCDF4/__init__.pyi b/src/netCDF4/__init__.pyi
@@ -699,6 +699,7 @@ def stringtoarr(
 def stringtochar(
     a: npt.NDArray[np.character],
     encoding: Literal["none", "None", "bytes"],
+    n_strlen: int | None = None,
 ) -> npt.NDArray[np.bytes_]: ...
 @overload
 def stringtochar(
diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx
@@ -1066,7 +1066,7 @@ If the `_Encoding` special attribute is set for a character array
 (dtype `S1`) variable, the `chartostring` utility function is used to convert the array of
 characters to an array of strings with one less dimension (the last dimension is
 interpreted as the length of each string) when reading the data. The character
-set (usually ascii) is specified by the `_Encoding` attribute. If `_Encoding`
+set is specified by the `_Encoding` attribute. If `_Encoding`
 is 'none' or 'bytes', then the character array is converted to a numpy
 fixed-width byte string array (dtype `S#`), otherwise a numpy unicode (dtype
 `U#`) array is created.  When writing the data,
@@ -5525,11 +5525,15 @@ cannot be safely cast to variable data type""" % attname
                 # if data is a string or a bytes object, convert to a numpy string array
                 # whose length is equal to the rightmost dimension of the
                 # variable.
-                if type(data) in [str,bytes]: data = numpy.asarray(data,dtype='S'+repr(self.shape[-1]))
+                if type(data) in [str,bytes]:
+                    if encoding == 'ascii':
+                        data = numpy.asarray(data,dtype='S'+repr(self.shape[-1]))
+                    else:
+                        data = numpy.asarray(data,dtype='U'+repr(self.shape[-1]))
                 if data.dtype.kind in ['S','U'] and data.dtype.itemsize > 1:
                     # if data is a numpy string array, convert it to an array
                     # of characters with one more dimension.
-                    data = stringtochar(data, encoding=encoding)
+                    data = stringtochar(data, encoding=encoding,n_strlen=self.shape[-1])
 
         # if structured data has strings (and _Encoding att set), create view as char arrays
         # (issue #773)
@@ -6771,9 +6775,9 @@ returns a rank 1 numpy character array of length NUMCHARS with datatype `'S1'`
     arr[0:len(string)] = tuple(string)
     return arr
 
-def stringtochar(a,encoding='utf-8'):
+def stringtochar(a,encoding='utf-8',n_strlen=None):
     """
-**`stringtochar(a,encoding='utf-8')`**
+**`stringtochar(a,encoding='utf-8',n_strlen=None)`**
 
 convert a string array to a character array with one extra dimension
 
@@ -6785,16 +6789,29 @@ optional kwarg `encoding` can be used to specify character encoding (default
 `utf-8`). If `encoding` is 'none' or 'bytes', a `numpy.string_` the input array
 is treated a raw byte strings (`numpy.string_`).
 
+optional kwarg `n_strlen` is the number of characters stored for each string in the
+output (encoded bytes, not code points, for multi-byte encodings).  Default is None,
+which means `n_strlen` is set to a.itemsize (the bytes per string in the input array).
+
 returns a numpy character array with datatype `'S1'` or `'U1'`
 and shape `a.shape + (N,)`, where N is the length of each string in a."""
     dtype = a.dtype.kind
+    if n_strlen is None:
+        n_strlen = a.dtype.itemsize
     if dtype not in ["S","U"]:
         raise ValueError("type must string or unicode ('S' or 'U')")
     if encoding in ['none','None','bytes']:
         b = numpy.array(tuple(a.tobytes()),'S1')
-    else:
+    elif encoding == 'ascii':
         b = numpy.array(tuple(a.tobytes().decode(encoding)),dtype+'1')
-    b.shape = a.shape + (a.itemsize,)
+        b.shape = a.shape + (n_strlen,)
+    else:
+        if not a.ndim:
+            a = numpy.array([a])
+        bbytes = [text.encode(encoding) for text in a]
+        pad = b'\0' * n_strlen
+        bbytes = [(x + pad)[:n_strlen] for x in bbytes]
+        b = numpy.array([[bb[i:i+1] for i in range(n_strlen)] for bb in bbytes])
     return b
 
 def chartostring(b,encoding='utf-8'):
@@ -6816,15 +6833,12 @@ returns a numpy string array with datatype `'UN'` (or `'SN'`) and shape
     dtype = b.dtype.kind
     if dtype not in ["S","U"]:
         raise ValueError("type must be string or unicode ('S' or 'U')")
-    if encoding in ['none','None','bytes']:
-        bs = b.tobytes()
-    else:
-        bs = b.tobytes().decode(encoding)
+    bs = b.tobytes()
     slen = int(b.shape[-1])
     if encoding in ['none','None','bytes']:
         a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'S'+repr(slen))
     else:
-        a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'U'+repr(slen))
+        a = numpy.array([bs[n1:n1+slen].decode(encoding) for n1 in range(0,len(bs),slen)],'U'+repr(slen))
     a.shape = b.shape[:-1]
     return a
 
diff --git a/test/test_stringarr.py b/test/test_stringarr.py
@@ -3,6 +3,7 @@
 import unittest
 import os
 from numpy.testing import assert_array_equal, assert_array_almost_equal
+import numpy as np
 
 def generateString(length, alphabet=string.ascii_letters + string.digits + string.punctuation):
     return(''.join([random.choice(alphabet) for i in range(length)]))
@@ -20,6 +21,11 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin
 datau = data.astype('U')
 datac = stringtochar(data, encoding='ascii')
 
+nx, n_strlen = 3, 12
+unicode_strings = np.array(['Münster', 'Liége', '東京'],dtype='U'+str(n_strlen))
+unicode_strings2 = np.array(['Münster', 'Москва', '東京'],dtype='U'+str(n_strlen))
+unicode_strings2_bytes = [b'M', b'\xc3', b'\xbc', b'n', b's', b't', b'e', b'r', b'\xd0', b'\x9c', b'\xd0', b'\xbe', b'\xd1', b'\x81', b'\xd0', b'\xba', b'\xd0', b'\xb2', b'\xd0', b'\xb0', b'\xe6', b'\x9d', b'\xb1', b'\xe4', b'\xba', b'\xac']
+
 class StringArrayTestCase(unittest.TestCase):
 
     def setUp(self):
@@ -28,6 +34,8 @@ def setUp(self):
         nc.createDimension('n1',None)
         nc.createDimension('n2',n2)
         nc.createDimension('nchar',nchar)
+        nc.createDimension("x", nx)
+        nc.createDimension("nstr", n_strlen)
         v = nc.createVariable('strings','S1',('n1','n2','nchar'))
         v2 = nc.createVariable('strings2','S1',('n1','n2','nchar'))
         # if _Encoding set, string array should automatically be converted
@@ -44,6 +52,11 @@ def setUp(self):
         v2[-1,-1] = data[-1,-1].tobytes() # write single python string
         # _Encoding should be ignored if an array of characters is specified
         v3[:] = stringtochar(data, encoding='ascii')
+        # test unicode strings (issue #1440)
+        v4 = nc.createVariable("strings4", "S1", dimensions=("x", "nstr",))
+        v4._Encoding = "UTF-8"
+        v4[:] = unicode_strings
+        v4[1] = "Москва"
         nc.close()
 
     def tearDown(self):
@@ -57,6 +70,10 @@ def runTest(self):
         v = nc.variables['strings']
         v2 = nc.variables['strings2']
         v3 = nc.variables['strings3']
+        v4 = nc.variables['strings4']
+        assert np.all(v4[:]==unicode_strings2)
+        v4.set_auto_chartostring(False)
+        assert (v4[:].compressed().tolist() == unicode_strings2_bytes)
         assert v.dtype.str[1:] in ['S1','U1']
         assert v.shape == (nrecs,n2,nchar)
         for nrec in range(nrecs):