
Commit e7eb6a1

Merge pull request #1 from JoeDediop/devBranch
Dev branch
2 parents (eb489f2 + 769e23f), commit e7eb6a1

File tree: 1 file changed (+75, -4 lines)


pandas/io/pytables.py (75 additions, 4 deletions)
@@ -1170,8 +1170,17 @@ def put(
 complevel : int, 0-9, default None
     Specifies a compression level for data.
     A value of 0 or None disables compression.
-min_itemsize : int, dict, or None
-    Dict of columns that specify minimum str sizes.
+min_itemsize : int, dict of str: int, or None, default None
+    Minimum size in bytes for string columns. This parameter is only used when
+    format='table'. Can be:
+    - int: Apply the same minimum size to all string columns
+    - dict: Map column names to their minimum sizes
+    - None: Use default sizing
+    **Important**: The size refers to the number of bytes after encoding, not
+    the number of characters. For multi-byte characters (e.g., Chinese, Arabic),
+    you need to account for the encoding. For example, the character '香' is
+    1 character but 3 bytes when encoded as UTF-8.
+    See examples below for proper usage with encoded strings.
 nan_rep : str
     Str to use as str nan representation.
 data_columns : list of columns or True, default None
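
The byte counts this hunk cites are easy to verify directly. The snippet below is an illustrative sketch (not part of the commit), using a few arbitrary sample strings to compare character length with UTF-8 byte length:

# Character length vs. encoded byte length; min_itemsize must cover the bytes.
samples = ["hello", "香", "香港", "你好世界"]
for s in samples:
    n_chars = len(s)                  # number of Unicode characters
    n_bytes = len(s.encode("utf-8"))  # number of bytes after UTF-8 encoding
    print(f"{s!r}: {n_chars} characters, {n_bytes} bytes")
# '香' reports 1 character but 3 bytes, matching the docstring's example.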
@@ -1203,6 +1212,23 @@ def put(
 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
 >>> store = pd.HDFStore("store.h5", "w") # doctest: +SKIP
 >>> store.put("data", df) # doctest: +SKIP
+
+Basic usage with ASCII strings:
+>>> df = pd.DataFrame([['hello', 'world']], columns=['A', 'B'])
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> store.put('data', df, format='table', min_itemsize={'A': 10, 'B': 10}) # doctest: +SKIP
+Usage with multi-byte characters:
+>>> df_unicode = pd.DataFrame([['香港', '北京']], columns=['city1', 'city2']) # doctest: +SKIP
+>>> # Each Chinese character is 3 bytes in UTF-8, so '香港' needs 6 bytes
+>>> store.put('cities', df_unicode, format='table', # doctest: +SKIP
+... min_itemsize={'city1': 12, 'city2': 12}, encoding='utf-8') # doctest: +SKIP
+Determining the correct size for encoded strings:
+>>> text = '香港' # doctest: +SKIP
+>>> len(text) # Character length # doctest: +SKIP
+2
+>>> len(text.encode('utf-8')) # Byte length # doctest: +SKIP
+6
+>>> # Use the byte length for min_itemsize
 """
 if format is None:
     format = get_option("io.hdf.default_format") or "fixed"
@@ -1330,8 +1356,17 @@ def append(
     A value of 0 or None disables compression.
 columns : default None
     This parameter is currently not accepted, try data_columns.
-min_itemsize : int, dict, or None
-    Dict of columns that specify minimum str sizes.
+min_itemsize : int, dict of str: int, or None, default None
+    Minimum size in bytes for string columns. Can be:
+    - int: Apply the same minimum size to all string columns
+    - dict: Map column names to their minimum sizes
+    - None: Use the existing table's column sizes
+    **Important**: This parameter is only effective when creating a new table.
+    If the table already exists, the column sizes are fixed and cannot be
+    changed. The size refers to the number of bytes after encoding, not
+    the number of characters.
+    For multi-byte characters, calculate the size using the encoded byte length.
+    For example, len('香'.encode('utf-8')) returns 3, while len('香') returns 1.
 nan_rep : str
     Str to use as str nan representation.
 chunksize : int or None
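
Since the column widths are locked in when the table is created, a practical approach is to size min_itemsize from the longest encoded value you expect to store. The sketch below is illustrative rather than part of the commit; the sample data and the 2x headroom factor are arbitrary choices:

import pandas as pd  # assumes pandas with the PyTables ("tables") package installed

df = pd.DataFrame({"city": ["香港", "北京", "New York"]})

# Size the string column by its longest UTF-8 encoding, not by character count.
max_bytes = int(df["city"].map(lambda s: len(str(s).encode("utf-8"))).max())

with pd.HDFStore("store.h5", "w") as store:
    # Leave headroom for longer rows appended later; the size is fixed after this call.
    store.put("cities", df, format="table",
              min_itemsize={"city": max_bytes * 2},
              encoding="utf-8")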
@@ -1364,6 +1399,10 @@ def append(
 Does *not* check if data being appended overlaps with existing
 data in the table, so be careful

+When appending to an existing table, the min_itemsize parameter has no effect
+as column sizes are already fixed. Set min_itemsize when initially creating
+the table with put() or the first append() call.
+
 Examples
 --------
 >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
@@ -1377,6 +1416,38 @@ def append(
 1 3 4
 0 5 6
 1 7 8
+
+Creating a table and appending data:
+
+>>> df1 = pd.DataFrame([['short', 'text']], columns=['A', 'B'])
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> # Set min_itemsize when creating the table
+>>> store.put('data', df1, format='table', min_itemsize={'A': 20, 'B': 20}) # doctest: +SKIP
+>>>
+>>> df2 = pd.DataFrame([['longer text here', 'more text']], columns=['A', 'B'])
+>>> store.append('data', df2) # doctest: +SKIP
+>>> store.close() # doctest: +SKIP
+
+Handling multi-byte characters:
+
+>>> df_en = pd.DataFrame([['hello']], columns=['text'])
+>>> df_zh = pd.DataFrame([['你好世界']], columns=['text']) # "Hello World" in Chinese
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> # Calculate size needed: len('你好世界'.encode('utf-8')) = 12 bytes
+>>> store.put('messages', df_en, format='table',
+... min_itemsize={'text': 15}, encoding='utf-8') # doctest: +SKIP
+>>> store.append('messages', df_zh) # doctest: +SKIP
+>>> store.close() # doctest: +SKIP
+
+Common error when min_itemsize is too small:
+
+>>> df = pd.DataFrame([['香']], columns=['char']) # 3 bytes in UTF-8
+>>> store = pd.HDFStore("store.h5", 'w') # doctest: +SKIP
+>>> # This will raise ValueError: string length [3] exceeds limit [1]
+>>> # store.put('test', df, format='table', min_itemsize={'char': 1})
+>>> # Correct usage:
+>>> store.put('test', df, format='table', min_itemsize={'char': 3}) # doctest: +SKIP
+>>> store.close() # doctest: +SKIP
 """
 if columns is not None:
     raise TypeError(
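
The "common error" scenario in the new examples is easiest to see end to end with a short standalone script. The sketch below is illustrative, not part of the commit: it creates a table whose string column is sized for 1-byte values and then appends a 3-byte character, which current pandas is expected to reject with a ValueError about the string exceeding the column's limit (the exact message may differ between versions):

import pandas as pd  # assumes pandas with the PyTables ("tables") package installed

short = pd.DataFrame({"char": ["a"]})   # fits in 1 byte
wide = pd.DataFrame({"char": ["香"]})   # 1 character, 3 bytes in UTF-8

with pd.HDFStore("demo.h5", "w") as store:
    # The 'char' column is fixed at 1 byte when the table is created.
    store.put("test", short, format="table", min_itemsize={"char": 1}, encoding="utf-8")
    try:
        store.append("test", wide)      # 3 bytes no longer fit the fixed column
    except ValueError as err:
        print("append rejected:", err)

    # Recreating the table with enough room avoids the error.
    store.put("test", pd.concat([short, wide]), format="table",
              min_itemsize={"char": 3}, encoding="utf-8")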
