Skip to content

Commit 32b4aa6

Browse files
authored
Update pytables.py
1 parent e7eb6a1 commit 32b4aa6

File tree

1 file changed

+24
-69
lines changed

1 file changed

+24
-69
lines changed

pandas/io/pytables.py

Lines changed: 24 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,18 +1169,16 @@ def put(
11691169
This parameter is currently not accepted.
11701170
complevel : int, 0-9, default None
11711171
Specifies a compression level for data.
1172-
A value of 0 or None disables compression.
1172+
A value of 0 or None disables compression.
11731173
min_itemsize : int, dict of str: int, or None, default None
1174-
Minimum size in bytes for string columns. This parameter is only used when
1175-
format='table'. Can be:
1176-
- int: Apply the same minimum size to all string columns
1177-
- dict: Map column names to their minimum sizes
1178-
- None: Use default sizing
1179-
**Important**: The size refers to the number of bytes after encoding, not
1180-
the number of characters. For multi-byte characters (e.g., Chinese, Arabic),
1181-
you need to account for the encoding. For example, the character '香' is
1182-
1 character but 3 bytes when encoded as UTF-8
1183-
See examples below for proper usage with encoded strings.
1174+
Minimum size in bytes for string columns when format='table'.
1175+
int - Apply the same minimum size to all string columns,
1176+
dict - Map column names to their minimum sizes, or
1177+
None - Use the default sizing.
1178+
Important: This specifies the byte length after encoding, not the
1179+
character count. For multi-byte characters, calculate the required
1180+
size using the encoded byte length.
1181+
See examples below for use.
11841182
nan_rep : str
11851183
Str to use as str nan representation.
11861184
data_columns : list of columns or True, default None
@@ -1213,22 +1211,9 @@ def put(
12131211
>>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
12141212
>>> store.put("data", df) # doctest: +SKIP
12151213
1216-
Basic usage with ASCII strings:
1217-
>>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B'])
1218-
>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1219-
>>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP
1220-
Usage with multi-byte characters:
1221-
>>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP
1222-
>>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes
1223-
>>> store.put('cities', df_unicode, format='table', # doctest: +SKIP
1224-
... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP
1225-
Determining the correct size for encoded strings:
1226-
>>> text = '香港' # doctest: +SKIP
1227-
>>> len(text) # Character length # doctest: +SKIP
1228-
2
1229-
>>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP
1230-
6
1231-
>>> # Use the byte length for min_itemsize
1214+
>>> # ASCII 'hello' = 5 bytes
1215+
>>> # UTF-8 '香' = 3 bytes (though only 1 character)
1216+
>>> # To find byte length: len(string.encode('utf-8'))
12321217
"""
12331218
if format is None:
12341219
format = get_option("io.hdf.default_format") or "fixed"
@@ -1355,18 +1340,16 @@ def append(
13551340
Specifies a compression level for data.
13561341
A value of 0 or None disables compression.
13571342
columns : default None
1358-
This parameter is currently not accepted, try data_columns.
1343+
This parameter is currently not accepted, try data_columns.
13591344
min_itemsize : int, dict of str: int, or None, default None
1360-
Minimum size in bytes for string columns. Can be:
1361-
- int: Apply the same minimum size to all string columns
1362-
- dict: Map column names to their minimum sizes
1363-
- None: Use the existing table's column sizes
1364-
**Important**: This parameter is only effective when creating a new table.
1365-
If the table already exists, the column sizes are fixed and cannot be
1366-
changed. The size refers to the number of bytes after encoding, not
1367-
the number of characters.
1368-
For multi-byte characters, calculate the size using the encoded byte length.
1369-
For example: len('香'.encode('utf-8')) returns 3, not len('香') which returns 1.
1345+
Minimum size in bytes for string columns when format='table'.
1346+
int - Apply the same minimum size to all string columns,
1347+
dict - Map column names to their minimum sizes, or
1348+
None - Use the default sizing.
1349+
Important: This specifies the byte length after encoding, not the
1350+
character count. For multi-byte characters, calculate the required
1351+
size using the encoded byte length.
1352+
See examples below for use.
13701353
nan_rep : str
13711354
Str to use as str nan representation.
13721355
chunksize : int or None
@@ -1417,37 +1400,9 @@ def append(
14171400
0 5 6
14181401
1 7 8
14191402
1420-
Creating a table and appending data:
1421-
1422-
>>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
1423-
>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1424-
>>> # Set min_itemsize when creating the table
1425-
>>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
1426-
>>>
1427-
>>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
1428-
>>> store.append('data', df2) # doctest: +SKIP
1429-
>>> store.close() # doctest: +SKIP
1430-
1431-
Handling multi-byte characters:
1432-
1433-
>>> df_en = pd.DataFrame([['hello']], columns=['text'])
1434-
>>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
1435-
>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1436-
>>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
1437-
>>> store.put('messages', df_en, format='table',
1438-
... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
1439-
>>> store.append('messages', df_zh) # doctest: +SKIP
1440-
>>> store.close() # doctest: +SKIP
1441-
1442-
Common error when min_itemsize is too small:
1443-
1444-
>>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
1445-
>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
1446-
>>> # This will raise ValueError: string length [3] exceeds limit [1]
1447-
>>> # store.put('test', df, format='table', min_itemsize={'char': 1})
1448-
>>> # Correct usage:
1449-
>>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
1450-
>>> store.close() # doctest: +SKIP
1403+
>>> # ASCII 'hello' = 5 bytes
1404+
>>> # UTF-8 '香' = 3 bytes (though only 1 character)
1405+
>>> # To find byte length: len(string.encode('utf-8'))
14511406
"""
14521407
if columns is not None:
14531408
raise TypeError(

0 commit comments

Comments
 (0)