Commit 4c41d08

HDF copy object docstrings
1 parent 2b40269 commit 4c41d08

File tree

1 file changed: +95 -0 lines changed


pandas_to_postgres/copy_hdf.py

Lines changed: 95 additions & 0 deletions
@@ -14,6 +14,11 @@
 
 
 class HDFTableCopy(BaseCopy):
+    """
+    Class for handling a standard case of reading a table from an HDF file into a pandas
+    DataFrame, iterating over it in chunks, and COPYing to PostgreSQL via StringIO CSV
+    """
+
     def __init__(
         self,
         hdf_tables: List[str],
@@ -24,6 +29,19 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to the destination SQL table
+            (the assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issues passing SQLAlchemy objects, so if
+            True, defer attaching these to the object until after it is pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL table
+        sql_table: string of the SQL table name
+        csv_chunksize: max rows to keep in memory when generating the CSV for COPY
+        """
         super().__init__(defer_sql_objs, conn, table_obj, sql_table, csv_chunksize)
 
         self.hdf_tables = hdf_tables
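For context, constructing a copier per the parameters documented above might look like the following sketch (the HDF keys, table name, and hdf_meta object are invented for illustration). With defer_sql_objs=True, conn and table_obj are omitted here and attached only after the object has been pickled into a worker process.

    # Hypothetical usage: two HDF keys feeding one SQL table (many:one).
    copier = HDFTableCopy(
        hdf_tables=["/country_2015", "/country_2016"],  # invented HDF keys
        hdf_meta=hdf_meta,         # HDFMetadata built from the store elsewhere
        defer_sql_objs=True,       # attach conn/table_obj after pickling by Pool
        sql_table="country_data",  # invented destination table name
        csv_chunksize=10 ** 6,
    )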
@@ -34,6 +52,17 @@ def __init__(
         self.hdf_chunksize = hdf_meta.chunksize
 
     def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Go through the full sequence to COPY data to the PostgreSQL table: drop
+        primary and foreign keys to optimize speed, TRUNCATE the table, COPY the
+        data, recreate the keys, and run ANALYZE.
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to the df during the sequence. Note
+            that each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
         self.drop_fks()
         self.drop_pk()
 
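The requirement that each formatter "should be able to handle kwargs for one another" is easiest to see in code. In this minimal sketch (the formatter names and columns are invented), every function accepts **kwargs so a single shared kwargs dict can carry arguments for all formatters in the list:

    import pandas as pd

    def add_constant_column(df: pd.DataFrame, value=None, **kwargs) -> pd.DataFrame:
        # Uses `value`; ignores kwargs meant for the other formatters.
        df["constant"] = value
        return df

    def drop_null_rows(df: pd.DataFrame, subset=None, **kwargs) -> pd.DataFrame:
        # Uses `subset`; ignores `value`.
        return df.dropna(subset=subset)

    # Every formatter receives the same kwargs, so each must tolerate the others'.
    formatter_kwargs = {"value": 2015, "subset": ["location_id"]}
    df = pd.DataFrame({"location_id": [1, None]})
    for formatter in [add_constant_column, drop_null_rows]:
        df = formatter(df, **formatter_kwargs)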
@@ -50,6 +79,15 @@ def copy(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
         self.analyze()
 
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table that relates to the SQL table to the database.
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to the df during the sequence. Note
+            that each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
         if self.hdf_tables is None:
             logger.warn(f"No HDF table found for SQL table {self.sql_table}")
             return
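The "StringIO CSV" COPY step named in the class docstring amounts to serializing each DataFrame chunk to an in-memory CSV buffer and streaming it to PostgreSQL. A minimal sketch, not the library's actual implementation, using a psycopg2 cursor and an invented table name:

    import io
    import pandas as pd

    def copy_chunk(df: pd.DataFrame, cursor, table: str = "country_data") -> None:
        # Write the chunk as header-less CSV into an in-memory buffer...
        buf = io.StringIO()
        df.to_csv(buf, index=False, header=False)
        buf.seek(0)
        # ...then stream it straight into the table with COPY.
        cursor.copy_expert(f"COPY {table} FROM STDIN WITH CSV", buf)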
@@ -81,6 +119,11 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
 
 
 class SmallHDFTableCopy(HDFTableCopy):
+    """
+    Class for handling the case where the table is small enough to be stored completely
+    in-memory, both for reading from the HDF file and for COPYing using StringIO.
+    """
+
     def __init__(
         self,
         hdf_tables: List[str],
@@ -91,6 +134,19 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to the destination SQL table
+            (the assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issues passing SQLAlchemy objects, so if
+            True, defer attaching these to the object until after it is pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL table
+        sql_table: string of the SQL table name
+        csv_chunksize: max rows to keep in memory when generating the CSV for COPY
+        """
         super().__init__(
             hdf_tables,
             hdf_meta,
@@ -102,6 +158,15 @@ def __init__(
         )
 
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table that relates to the SQL table to the database.
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to the df during the sequence. Note
+            that each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
         if self.hdf_tables is None:
             logger.warn("No HDF table found for SQL table {self.sql_table}")
             return
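The small-table path needs no HDF-side chunking: a single read pulls the whole key into memory before the StringIO COPY step. A sketch with invented file and key names:

    import pandas as pd

    # Small table: the entire HDF key fits in memory, so read it in one call.
    df = pd.read_hdf("data.h5", key="/small_table")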
@@ -129,6 +194,14 @@ def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
 
 
 class BigHDFTableCopy(HDFTableCopy):
+    """
+    Class for handling the special case of particularly large tables. For these, we
+    iterate over reading the table from the HDF file and then iterate again over each
+    of those chunks, to keep the number of rows held in-memory at a reasonable size.
+    Note that these are iterated using pd.read_hdf(..., start, stop) rather than
+    pd.read_hdf(..., iterator=True) because we found the performance was much better.
+    """
+
     def __init__(
         self,
         hdf_tables: List[str],
@@ -139,6 +212,19 @@ def __init__(
         sql_table: str = None,
         csv_chunksize: int = 10 ** 6,
     ):
+        """
+        Parameters
+        ----------
+        hdf_tables: list of HDF keys with data corresponding to the destination SQL table
+            (the assumption being that HDF tables:SQL tables is many:one)
+        hdf_meta: HDFMetadata object with information from the store
+        defer_sql_objs: multiprocessing has issues passing SQLAlchemy objects, so if
+            True, defer attaching these to the object until after it is pickled by Pool
+        conn: SQLAlchemy connection managed outside of the object
+        table_obj: SQLAlchemy object for the destination SQL table
+        sql_table: string of the SQL table name
+        csv_chunksize: max rows to keep in memory when generating the CSV for COPY
+        """
         super().__init__(
             hdf_tables,
             hdf_meta,
@@ -150,6 +236,15 @@ def __init__(
         )
 
     def hdf_to_pg(self, data_formatters=[cast_pandas], data_formatter_kwargs={}):
+        """
+        Copy each HDF table that relates to the SQL table to the database.
+
+        Parameters
+        ----------
+        data_formatters: list of functions to apply to the df during the sequence. Note
+            that each of these functions should be able to handle kwargs for one another
+        data_formatter_kwargs: dict of kwargs to pass to the data_formatters functions
+        """
         if self.hdf_tables is None:
             logger.warn(f"No HDF table found for SQL table {self.sql_table}")
             return
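The start/stop iteration that the BigHDFTableCopy docstring prefers over iterator=True looks roughly like the following sketch (file name, key, and chunk size are invented; the row count is read from the store's metadata, which is presumably what HDFMetadata collects):

    import pandas as pd

    path, key, chunksize = "data.h5", "/big_table", 10 ** 7  # invented values
    with pd.HDFStore(path, mode="r") as store:
        nrows = store.get_storer(key).nrows  # total rows in this HDF table

    for start in range(0, nrows, chunksize):
        chunk = pd.read_hdf(path, key=key, start=start, stop=start + chunksize)
        # ...each chunk is then split into csv_chunksize pieces for the COPY step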
