
Commit e7eb6a1

Merge pull request #1 from JoeDediop/devBranch
Dev branch
2 parents (eb489f2 + 769e23f), commit e7eb6a1

File tree: 1 file changed (+75, -4 lines)


pandas/io/pytables.py (75 additions, 4 deletions)
@@ -1170,8 +1170,17 @@ def put(
 complevel : int, 0-9, default None
     Specifies a compression level for data.
     A value of 0 or None disables compression.
-min_itemsize : int, dict, or None
-    Dict of columns that specify minimum str sizes.
+min_itemsize : int, dict of str: int, or None, default None
+    Minimum size in bytes for string columns. This parameter is only used when
+    format='table'. Can be:
+    - int: Apply the same minimum size to all string columns
+    - dict: Map column names to their minimum sizes
+    - None: Use default sizing
+    **Important**: The size refers to the number of bytes after encoding, not
+    the number of characters. For multi-byte characters (e.g., Chinese, Arabic),
+    you need to account for the encoding. For example, the character '香' is
+    1 character but 3 bytes when encoded as UTF-8.
+    See examples below for proper usage with encoded strings.
 nan_rep : str
     Str to use as str nan representation.
 data_columns : list of columns or True, default None
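
The byte counts this hunk cites are easy to verify directly. The snippet below is an illustrative sketch (not part of the commit), using a few arbitrary sample strings to compare character length with UTF-8 byte length:

# Character length vs. encoded byte length; min_itemsize must cover the bytes.
samples = ["hello", "香", "香港", "你好世界"]
for s in samples:
    n_chars = len(s)                  # number of Unicode characters
    n_bytes = len(s.encode("utf-8"))  # number of bytes after UTF-8 encoding
    print(f"{s!r}: {n_chars} characters, {n_bytes} bytes")
# '香' reports 1 character but 3 bytes, matching the docstring's example.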
@@ -1203,6 +1212,23 @@ def put(
 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
 >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
 >>> store.put("data", df) # doctest: +SKIP
+
+Basic usage with ASCII strings:
+>>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B'])
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP
+Usage with multi-byte characters:
+>>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP
+>>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes
+>>> store.put('cities', df_unicode, format='table', # doctest: +SKIP
+... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP
+Determining the correct size for encoded strings:
+>>> text = '香港' # doctest: +SKIP
+>>> len(text) # Character length # doctest: +SKIP
+2
+>>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP
+6
+>>> # Use the byte length for min_itemsize
 """
 if format is None:
     format = get_option("io.hdf.default_format") or "fixed"
@@ -1330,8 +1356,17 @@ def append(
     A value of 0 or None disables compression.
 columns : default None
     This parameter is currently not accepted, try data_columns.
-min_itemsize : int, dict, or None
-    Dict of columns that specify minimum str sizes.
+min_itemsize : int, dict of str: int, or None, default None
+    Minimum size in bytes for string columns. Can be:
+    - int: Apply the same minimum size to all string columns
+    - dict: Map column names to their minimum sizes
+    - None: Use the existing table's column sizes
+    **Important**: This parameter is only effective when creating a new table.
+    If the table already exists, the column sizes are fixed and cannot be
+    changed. The size refers to the number of bytes after encoding, not
+    the number of characters.
+    For multi-byte characters, calculate the size using the encoded byte length.
+    For example, len('香'.encode('utf-8')) returns 3, while len('香') returns 1.
 nan_rep : str
     Str to use as str nan representation.
 chunksize : int or None
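
Since the column widths are locked in when the table is created, a practical approach is to size min_itemsize from the longest encoded value you expect to store. The sketch below is illustrative rather than part of the commit; the sample data and the 2x headroom factor are arbitrary choices:

import pandas as pd  # assumes pandas with the PyTables ("tables") package installed

df = pd.DataFrame({"city": ["香港", "北京", "New York"]})

# Size the string column by its longest UTF-8 encoding, not by character count.
max_bytes = int(df["city"].map(lambda s: len(str(s).encode("utf-8"))).max())

with pd.HDFStore("store.h5", "w") as store:
    # Leave headroom for longer rows appended later; the size is fixed after this call.
    store.put("cities", df, format="table",
              min_itemsize={"city": max_bytes * 2},
              encoding="utf-8")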
@@ -1364,6 +1399,10 @@ def append(
 Does *not* check if data being appended overlaps with existing
 data in the table, so be careful

+When appending to an existing table, the min_itemsize parameter has no effect
+as column sizes are already fixed. Set min_itemsize when initially creating
+the table with put() or the first append() call.
+
 Examples
 --------
 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
@@ -1377,6 +1416,38 @@ def append(
 1 3 4
 0 5 6
 1 7 8
+
+Creating a table and appending data:
+
+>>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> # Set min_itemsize when creating the table
+>>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
+>>>
+>>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
+>>> store.append('data', df2) # doctest: +SKIP
+>>> store.close() # doctest: +SKIP
+
+Handling multi-byte characters:
+
+>>> df_en = pd.DataFrame([['hello']], columns=['text'])
+>>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
+>>> store.put('messages', df_en, format='table',
+... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
+>>> store.append('messages', df_zh) # doctest: +SKIP
+>>> store.close() # doctest: +SKIP
+
+Common error when min_itemsize is too small:
+
+>>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> # This will raise ValueError: string length [3] exceeds limit [1]
+>>> # store.put('test', df, format='table', min_itemsize={'char': 1})
+>>> # Correct usage:
+>>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
+>>> store.close() # doctest: +SKIP
 """
 if columns is not None:
     raise TypeError(
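
The "common error" scenario in the new examples is easiest to see end to end with a short standalone script. The sketch below is illustrative, not part of the commit: it creates a table whose string column is sized for 1-byte values and then appends a 3-byte character, which current pandas is expected to reject with a ValueError about the string exceeding the column's limit (the exact message may differ between versions):

import pandas as pd  # assumes pandas with the PyTables ("tables") package installed

short = pd.DataFrame({"char": ["a"]})   # fits in 1 byte
wide = pd.DataFrame({"char": ["香"]})   # 1 character, 3 bytes in UTF-8

with pd.HDFStore("demo.h5", "w") as store:
    # The 'char' column is fixed at 1 byte when the table is created.
    store.put("test", short, format="table", min_itemsize={"char": 1}, encoding="utf-8")
    try:
        store.append("test", wide)      # 3 bytes no longer fit the fixed column
    except ValueError as err:
        print("append rejected:", err)

    # Recreating the table with enough room avoids the error.
    store.put("test", pd.concat([short, wide]), format="table",
              min_itemsize={"char": 3}, encoding="utf-8")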
