@@ -1170,8 +1170,17 @@ def put(
1170
1170
complevel : int, 0-9, default None
1171
1171
Specifies a compression level for data.
1172
1172
A value of 0 or None disables compression.
1173
- min_itemsize : int, dict, or None
1174
- Dict of columns that specify minimum str sizes.
1173
+ min_itemsize : int, dict of str: int, or None, default None
1174
+ Minimum size in bytes for string columns. This parameter is only used when
1175
+ format='table'. Can be:
1176
+ - int: Apply the same minimum size to all string columns
1177
+ - dict: Map column names to their minimum sizes
1178
+ - None: Use default sizing
1179
+ **Important**: The size refers to the number of bytes after encoding, not
1180
+ the number of characters. For multi-byte characters (e.g., Chinese, Arabic),
1181
+ you need to account for the encoding. For example, the character '香' is
1182
+ 1 character but 3 bytes when encoded as UTF-8.
1183
+ See examples below for proper usage with encoded strings.
1175
1184
nan_rep : str
1176
1185
Str to use as str nan representation.
1177
1186
data_columns : list of columns or True, default None
@@ -1203,6 +1212,23 @@ def put(
1203
1212
>>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
1204
1213
>>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
1205
1214
>>> store.put("data", df) # doctest: +SKIP
1215
+
1216
+ Basic usage with ASCII strings:
1217
+ >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B'])
1218
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1219
+ >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP
1220
+ Usage with multi-byte characters:
1221
+ >>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP
1222
+ >>> # Each of these characters is 3 bytes in UTF-8, so '香港' needs 6 bytes
1223
+ >>> store.put('cities', df_unicode, format='table', # doctest: +SKIP
1224
+ ... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP
1225
+ Determining the correct size for encoded strings:
1226
+ >>> text = '香港' # doctest: +SKIP
1227
+ >>> len(text) # Character length # doctest: +SKIP
1228
+ 2
1229
+ >>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP
1230
+ 6
1231
+ >>> # Use the byte length for min_itemsize
1206
1232
"""
1207
1233
if format is None :
1208
1234
format = get_option ("io.hdf.default_format" ) or "fixed"
@@ -1330,8 +1356,17 @@ def append(
1330
1356
A value of 0 or None disables compression.
1331
1357
columns : default None
1332
1358
This parameter is currently not accepted, try data_columns.
1333
- min_itemsize : int, dict, or None
1334
- Dict of columns that specify minimum str sizes.
1359
+ min_itemsize : int, dict of str: int, or None, default None
1360
+ Minimum size in bytes for string columns. Can be:
1361
+ - int: Apply the same minimum size to all string columns
1362
+ - dict: Map column names to their minimum sizes
1363
+ - None: Use the existing table's column sizes, or default sizing for a new table
1364
+ **Important**: This parameter is only effective when creating a new table.
1365
+ If the table already exists, the column sizes are fixed and cannot be
1366
+ changed. The size refers to the number of bytes after encoding, not
1367
+ the number of characters.
1368
+ For multi-byte characters, calculate the size using the encoded byte length.
1369
+ For example, len('香'.encode('utf-8')) returns 3, whereas len('香') returns 1.
1335
1370
nan_rep : str
1336
1371
Str to use as str nan representation.
1337
1372
chunksize : int or None
@@ -1364,6 +1399,10 @@ def append(
1364
1399
Does *not* check if data being appended overlaps with existing
1365
1400
data in the table, so be careful
1366
1401
1402
+ When appending to an existing table, the min_itemsize parameter has no effect
1403
+ as column sizes are already fixed. Set min_itemsize when initially creating
1404
+ the table with put() or the first append() call.
1405
+
1367
1406
Examples
1368
1407
--------
1369
1408
>>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
@@ -1377,6 +1416,38 @@ def append(
1377
1416
1 3 4
1378
1417
0 5 6
1379
1418
1 7 8
1419
+
1420
+ Creating a table and appending data:
1421
+
1422
+ >>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
1423
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1424
+ >>> # Set min_itemsize when creating the table
1425
+ >>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
1426
+ >>>
1427
+ >>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
1428
+ >>> store.append('data', df2) # doctest: +SKIP
1429
+ >>> store.close() # doctest: +SKIP
1430
+
1431
+ Handling multi-byte characters:
1432
+
1433
+ >>> df_en = pd.DataFrame([['hello']], columns=['text'])
1434
+ >>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
1435
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1436
+ >>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
1437
+ >>> store.put('messages', df_en, format='table',
1438
+ ... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
1439
+ >>> store.append('messages', df_zh) # doctest: +SKIP
1440
+ >>> store.close() # doctest: +SKIP
1441
+
1442
+ Common error when min_itemsize is too small:
1443
+
1444
+ >>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
1445
+ >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1446
+ >>> # This will raise ValueError: string length [3] exceeds limit [1]
1447
+ >>> # store.put('test', df, format='table', min_itemsize={'char': 1})
1448
+ >>> # Correct usage:
1449
+ >>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
1450
+ >>> store.close() # doctest: +SKIP
1380
1451
"""
1381
1452
if columns is not None :
1382
1453
raise TypeError (
0 commit comments