@@ -83,6 +83,15 @@ def _is_numeric_array(value) -> bool:
8383 return arr .dtype .kind in {"i" , "f" , "u" , "c" }
8484 return False
8585
86+ @staticmethod
87+ def _is_string_array (value ) -> bool :
88+ """Return True if the value is a string array (list of strings or numpy unicode array)."""
89+ if isinstance (value , np .ndarray ):
90+ return value .dtype .kind in {"U" , "S" }
91+ if isinstance (value , list ) and value and all (isinstance (v , str ) for v in value ):
92+ return True
93+ return False
94+
8695 @classmethod
8796 def get_queryable_properties (cls , include_internal : bool = False ) -> dict :
8897 """
@@ -272,7 +281,7 @@ def detect_storage_backend(cls, prop_name: str) -> str:
272281 store_in = json_schema_extra .get (cls ._storage_metadata_key , '' ).lower ()
273282 else :
274283 store_in = 'db' # default to db for unknown properties
275-
284+
276285 return store_in
277286
278287 def _store_properties (self ):
@@ -309,6 +318,23 @@ def _store_properties(self):
309318 repository_dict [prop_name ] = arr
310319 if self ._store_shape_metadata :
311320 database_dict [f'shape|{ prop_name } ' ] = list (arr .shape )
321+ elif target == "repository" and self ._is_string_array (value ):
322+ arr = np .asarray (value , dtype = str )
323+ repository_dict [prop_name ] = arr
324+ if self ._store_shape_metadata :
325+ database_dict [f'shape|{ prop_name } ' ] = list (arr .shape )
326+ elif prop_name == 'site_indices' :
327+ # site_indices is a ragged list-of-lists (one per kind, variable length because of different number of sites per kind).
328+ # Encode as two flat 1D int arrays using CSR format so the npz stays
329+ # homogeneous (allow_pickle=False compatible):
330+ # site_indices_flat : all indices concatenated, shape (total_sites,)
331+ # site_indices_offsets : cumulative start positions, shape (n_kinds + 1,)
332+ # e.g. [[0,1],[2],[3,4,5]] → flat=[0,1,2,3,4,5], offsets=[0,2,3,6]
333+ flat = np .array ([idx for sublist in value for idx in sublist ], dtype = np .int64 )
334+ lengths = np .array ([len (sublist ) for sublist in value ], dtype = np .int64 )
335+ offsets = np .concatenate ([[0 ], np .cumsum (lengths )]).astype (np .int64 )
336+ repository_dict ['site_indices_flat' ] = flat
337+ repository_dict ['site_indices_offsets' ] = offsets
312338 else :
313339 database_dict [prop_name ] = value
314340
@@ -343,7 +369,23 @@ def _load_properties_from_npz(self) -> dict:
343369 with self .base .repository .open (self ._properties_filename , mode = 'rb' ) as handle :
344370 npz_data = np .load (handle , allow_pickle = False )
345371 # Convert to regular dict (npz returns NpzFile object)
346- properties = {key : npz_data [key ] for key in npz_data .files }
372+ # String arrays (dtype 'U' or 'S') are converted back to Python lists
373+ properties = {}
374+ for key in npz_data .files :
375+ arr = npz_data [key ]
376+ if arr .dtype .kind in {"U" , "S" }:
377+ properties [key ] = arr .tolist ()
378+ else :
379+ properties [key ] = arr
380+
381+ # Decode CSR-encoded site_indices back into list-of-lists
382+ if 'site_indices_flat' in properties and 'site_indices_offsets' in properties :
383+ flat = properties .pop ('site_indices_flat' )
384+ offsets = properties .pop ('site_indices_offsets' )
385+ properties ['site_indices' ] = [
386+ flat [offsets [i ]:offsets [i + 1 ]].tolist ()
387+ for i in range (len (offsets ) - 1 )
388+ ]
347389
348390 # Cache if stored
349391 if self .is_stored :
0 commit comments