@@ -1169,18 +1169,16 @@ def put(
This parameter is currently not accepted.
complevel : int, 0-9, default None
Specifies a compression level for data.
- A value of 0 or None disables compression.
+ A value of 0 or None disables compression.
min_itemsize : int, dict of str: int, or None, default None
- Minimum size in bytes for string columns. This parameter is only used when
- format='table'. Can be:
- - int: Apply the same minimum size to all string columns
- - dict: Map column names to their minimum sizes
- - None: Use default sizing
- **Important**: The size refers to the number of bytes after encoding, not
- the number of characters. For multi-byte characters (e.g., Chinese, Arabic),
- you need to account for the encoding. For example, the character '香' is
- 1 character but 3 bytes when encoded as UTF-8
- See examples below for proper usage with encoded strings.
+ Minimum size in bytes for string columns when format='table'.
+ int - apply the same minimum size to all string columns,
+ dict - map column names to their minimum sizes, or
+ None - use the default sizing.
+ Important: This specifies the byte length after encoding, not the
+ character count. For multi-byte characters, calculate the required
+ size using the encoded byte length.
+ See examples below for usage.
nan_rep : str
Str to use as str nan representation.
data_columns : list of columns or True, default None
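For illustration only (not part of the patch): a minimal sketch of how a suitable min_itemsize can be derived from the data itself, per the byte-length rule described in the new docstring text. It assumes UTF-8 encoding and uses pandas' Series.str.encode / Series.str.len; the column name and values are made up for the example.

    import pandas as pd

    df = pd.DataFrame({"city": ["香港", "北京", "hello"]})
    # Byte length of each value after UTF-8 encoding (not the character count)
    byte_lengths = df["city"].str.encode("utf-8").str.len()
    # '香港' and '北京' are 2 characters but 6 bytes each; 'hello' is 5 bytes
    required = int(byte_lengths.max())  # 6 -> a safe lower bound for min_itemsize

Passing this value (or anything larger) for the column keeps writes with format='table' from overflowing the reserved width.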
@@ -1213,22 +1211,9 @@ def put(
>>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
>>> store.put("data", df)  # doctest: +SKIP

- Basic usage with ASCII strings:
- >>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B'])
- >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
- >>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP
- Usage with multi-byte characters:
- >>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP
- >>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes
- >>> store.put('cities', df_unicode, format='table', # doctest: +SKIP
- ... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP
- Determining the correct size for encoded strings:
- >>> text = '香港' # doctest: +SKIP
- >>> len(text) # Character length # doctest: +SKIP
- 2
- >>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP
- 6
- >>> # Use the byte length for min_itemsize
+ >>> len('hello')  # ASCII: 5 characters, 5 bytes  # doctest: +SKIP
+ >>> len('香'.encode('utf-8'))  # 1 character, 3 bytes in UTF-8  # doctest: +SKIP
+ >>> # Use the encoded byte length for min_itemsize
"""
if format is None:
    format = get_option("io.hdf.default_format") or "fixed"
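For illustration only (not part of the patch): the shortened example above no longer shows a full put() call, so here is a hedged sketch of the multi-byte case it refers to, mirroring the removed docstring text. The file name and reserved sizes are illustrative.

    import pandas as pd

    df = pd.DataFrame([["香港", "北京"]], columns=["city1", "city2"])
    with pd.HDFStore("store.h5", mode="w") as store:
        # Each value is 2 characters but 6 bytes in UTF-8; 12 leaves headroom
        # for longer values written later.
        store.put(
            "cities",
            df,
            format="table",
            min_itemsize={"city1": 12, "city2": 12},
            encoding="utf-8",
        )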
@@ -1355,18 +1340,16 @@ def append(
Specifies a compression level for data.
A value of 0 or None disables compression.
columns : default None
- This parameter is currently not accepted, try data_columns.
+ This parameter is currently not accepted, try data_columns.
min_itemsize : int, dict of str: int, or None, default None
- Minimum size in bytes for string columns. Can be:
- - int: Apply the same minimum size to all string columns
- - dict: Map column names to their minimum sizes
- - None: Use the existing table's column sizes
- **Important**: This parameter is only effective when creating a new table.
- If the table already exists, the column sizes are fixed and cannot be
- changed. The size refers to the number of bytes after encoding, not
- the number of characters.
- For multi-byte characters, calculate the size using the encoded byte length.
- For example: len('香'.encode('utf-8')) returns 3, not len('香') which returns 1.
+ Minimum size in bytes for string columns when format='table'.
+ int - apply the same minimum size to all string columns,
+ dict - map column names to their minimum sizes, or
+ None - use the default sizing.
+ Important: This specifies the byte length after encoding, not the
+ character count. For multi-byte characters, calculate the required
+ size using the encoded byte length.
+ See examples below for usage.
nan_rep : str
Str to use as str nan representation.
chunksize : int or None
@@ -1417,37 +1400,9 @@ def append(
0 5 6
1 7 8

- Creating a table and appending data:
-
- >>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
- >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
- >>> # Set min_itemsize when creating the table
- >>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
- >>>
- >>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
- >>> store.append('data', df2) # doctest: +SKIP
- >>> store.close() # doctest: +SKIP
-
- Handling multi-byte characters:
-
- >>> df_en = pd.DataFrame([['hello']], columns=['text'])
- >>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
- >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
- >>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
- >>> store.put('messages', df_en, format='table',
- ... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
- >>> store.append('messages', df_zh) # doctest: +SKIP
- >>> store.close() # doctest: +SKIP
-
- Common error when min_itemsize is too small:
-
- >>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
- >>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
- >>> # This will raise ValueError: string length [3] exceeds limit [1]
- >>> # store.put('test', df, format='table', min_itemsize={'char': 1})
- >>> # Correct usage:
- >>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
- >>> store.close() # doctest: +SKIP
+ >>> len('hello')  # ASCII: 5 characters, 5 bytes  # doctest: +SKIP
+ >>> len('香'.encode('utf-8'))  # 1 character, 3 bytes in UTF-8  # doctest: +SKIP
+ >>> # Use the encoded byte length for min_itemsize
"""
if columns is not None:
    raise TypeError(
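For illustration only (not part of the patch): to complement the trimmed append() example, a minimal sketch of the create-then-append flow described in the removed text. It assumes the byte widths reserved when the table is created are the binding limit for later appends, as the removed docstring stated.

    import pandas as pd

    df1 = pd.DataFrame([["short", "text"]], columns=["A", "B"])
    df2 = pd.DataFrame([["longer text here", "more text"]], columns=["A", "B"])

    with pd.HDFStore("store.h5", mode="w") as store:
        # min_itemsize is honoured only when the table is created; later appends
        # must fit within the byte widths reserved here.
        store.put("data", df1, format="table", min_itemsize={"A": 20, "B": 20})
        store.append("data", df2)  # fits: every value is at most 20 bytes
        too_long = pd.DataFrame([["x" * 30, "y"]], columns=["A", "B"])
        # store.append("data", too_long)  # would raise ValueError: 30 bytes exceeds the 20-byte width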