parallel to_sql() (#461)

eavidan · devin-petersohn · commit c65f3966a1ed · 2019-02-16T09:03:50.000-08:00
* Initial working implementation * Fix issue in #453#issuecomment-461130825 * Initial working implementation * Fix issue in #453#issuecomment-461130825 * Align with @devin-petersohn's comments, documentation, and unit tests * Remove to_sql from pandas_query_compiler and move it to io.py * Add to_sql to base/io.py to support other engines by defaulting to pandas * Align the base io to_sql signature to take qc as well * Fix base io to_sql to first convert qc into a pandas DataFrame and then run to_sql() * Clean up tests to follow pytest best practices * Undo unnecessary changes * Fix linting and make the fixture a factory * More linting
diff --git a/modin/data_management/factories.py b/modin/data_management/factories.py
@@ -154,6 +154,14 @@ def read_sql(cls, **kwargs):
     def _read_sql(cls, **kwargs):
         return cls.io_cls.read_sql(**kwargs)
 
+    @classmethod
+    def to_sql(cls, *args, **kwargs):
+        return cls._determine_engine()._to_sql(*args, **kwargs)
+
+    @classmethod
+    def _to_sql(cls, *args, **kwargs):
+        return cls.io_cls.to_sql(*args, **kwargs)
+
 
 class PandasOnRayFactory(BaseFactory):
 
diff --git a/modin/engines/base/io.py b/modin/engines/base/io.py
@@ -422,3 +422,29 @@ def read_sql(
                 chunksize=chunksize,
             )
         )
+
+    @classmethod
+    def to_sql(
+        cls,
+        qc,
+        name,
+        con,
+        schema=None,
+        if_exists="fail",
+        index=True,
+        index_label=None,
+        chunksize=None,
+        dtype=None,
+    ):
+        ErrorMessage.default_to_pandas("`to_sql`")
+        df = qc.to_pandas()
+        df.to_sql(
+            name=name,
+            con=con,
+            schema=schema,
+            if_exists=if_exists,
+            index=index,
+            index_label=index_label,
+            chunksize=chunksize,
+            dtype=dtype,
+        )
diff --git a/modin/engines/ray/pandas_on_ray/io.py b/modin/engines/ray/pandas_on_ray/io.py
@@ -538,6 +538,35 @@ def read_feather(cls, path, nthreads=1, columns=None):
         )
         return new_query_compiler
 
+    @classmethod
+    def to_sql(cls, qc, **kwargs):
+        """Write records stored in a DataFrame to a SQL database.
+        Args:
+            qc: the query compiler of the DataFrame to write with to_sql
+            kwargs: keyword arguments forwarded to pandas.DataFrame.to_sql()
+        """
+        # First write an empty DataFrame so the full table is created in the database.
+        # This also validates the input arguments against pandas.
+        # to_sql() should return only once all rows have been inserted into the database.
+        # Because the mapping operation is non-blocking, each partition returns an empty
+        # DataFrame, and calling to_pandas() on that result is what blocks until completion.
+
+        empty_df = qc.head(1).to_pandas().head(0)
+        empty_df.to_sql(**kwargs)
+        # From here on, each partition appends its own rows to the table created above
+        kwargs["if_exists"] = "append"
+        columns = qc.columns
+
+        def func(df, **kwargs):
+            df.columns = columns
+            df.to_sql(**kwargs)
+            return pandas.DataFrame()
+
+        map_func = qc._prepare_method(func, **kwargs)
+        result = qc.map_across_full_axis(1, map_func)
+        # blocking operation
+        result.to_pandas()
+
 
 @ray.remote
 def get_index(index_name, *partition_indices):  # pragma: no cover
diff --git a/modin/experimental/pandas/test/test_io_exp.py b/modin/experimental/pandas/test/test_io_exp.py
@@ -2,53 +2,42 @@
 import pandas
 import pytest
 import modin.experimental.pandas as pd
-
-
-from modin.pandas.test.test_io import (
-    setup_sql_file,
-    teardown_sql_file,
+from modin.pandas.test.test_io import (  # noqa: F401
     modin_df_equals_pandas,
+    make_sql_connection,
 )
 
 
-def test_from_sql_distributed():
+def test_from_sql_distributed(make_sql_connection):  # noqa: F811
     if os.environ.get("MODIN_ENGINE", "") == "Ray":
         filename = "test_from_sql_distributed.db"
-        teardown_sql_file(filename)
         table = "test_from_sql_distributed"
-        db_uri = "sqlite:///" + filename
-        setup_sql_file(db_uri, filename, table, True)
+        conn = make_sql_connection(filename, table)
         query = "select * from {0}".format(table)
 
-        pandas_df = pandas.read_sql(query, db_uri)
+        pandas_df = pandas.read_sql(query, conn)
         modin_df_from_query = pd.read_sql(
-            query, db_uri, partition_column="col1", lower_bound=0, upper_bound=6
+            query, conn, partition_column="col1", lower_bound=0, upper_bound=6
         )
         modin_df_from_table = pd.read_sql(
-            table, db_uri, partition_column="col1", lower_bound=0, upper_bound=6
+            table, conn, partition_column="col1", lower_bound=0, upper_bound=6
         )
 
         assert modin_df_equals_pandas(modin_df_from_query, pandas_df)
         assert modin_df_equals_pandas(modin_df_from_table, pandas_df)
 
-        teardown_sql_file(filename)
 
-
-def test_from_sql_defaults():
+def test_from_sql_defaults(make_sql_connection):  # noqa: F811
     filename = "test_from_sql_distributed.db"
-    teardown_sql_file(filename)
     table = "test_from_sql_distributed"
-    db_uri = "sqlite:///" + filename
-    setup_sql_file(db_uri, filename, table, True)
+    conn = make_sql_connection(filename, table)
     query = "select * from {0}".format(table)
 
-    pandas_df = pandas.read_sql(query, db_uri)
+    pandas_df = pandas.read_sql(query, conn)
     with pytest.warns(UserWarning):
-        modin_df_from_query = pd.read_sql(query, db_uri)
+        modin_df_from_query = pd.read_sql(query, conn)
     with pytest.warns(UserWarning):
-        modin_df_from_table = pd.read_sql(table, db_uri)
+        modin_df_from_table = pd.read_sql(table, conn)
 
     assert modin_df_equals_pandas(modin_df_from_query, pandas_df)
     assert modin_df_equals_pandas(modin_df_from_table, pandas_df)
-
-    teardown_sql_file(filename)
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -4188,25 +4188,34 @@ def to_sql(
         self,
         name,
         con,
-        flavor=None,
         schema=None,
         if_exists="fail",
         index=True,
         index_label=None,
         chunksize=None,
         dtype=None,
-    ):  # pragma: no cover
-        return self._default_to_pandas(
-            pandas.DataFrame.to_sql,
-            name,
-            con,
-            flavor,
-            schema,
-            if_exists,
-            index,
-            index_label,
-            chunksize,
-            dtype,
+    ):
+        new_query_compiler = self._query_compiler
+        # Write the index to the database by inserting it into the DataFrame as a column
+        if index:
+            if not index_label:
+                index_label = "index"
+            new_query_compiler = new_query_compiler.insert(0, index_label, self.index)
+            # Disable index writing so pandas' to_sql does not write the index a second time
+            index = False
+
+        from modin.data_management.factories import BaseFactory
+
+        BaseFactory.to_sql(
+            new_query_compiler,
+            name=name,
+            con=con,
+            schema=schema,
+            if_exists=if_exists,
+            index=index,
+            index_label=index_label,
+            chunksize=chunksize,
+            dtype=dtype,
         )
 
     def to_stata(
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
@@ -10,7 +10,6 @@
 from pathlib import Path
 import pyarrow as pa
 import os
-import sqlite3
 import sys
 
 # needed to resolve ray-project/ray#3744
@@ -47,7 +46,6 @@ def setup_parquet_file(row_size, force=False):
         ).to_parquet(TEST_PARQUET_FILENAME)
 
 
-@pytest.fixture
 def create_test_ray_dataframe():
     df = pd.DataFrame(
         {
@@ -62,7 +60,6 @@ def create_test_ray_dataframe():
     return df
 
 
-@pytest.fixture
 def create_test_pandas_dataframe():
     df = pandas.DataFrame(
         {
@@ -261,26 +258,41 @@ def teardown_pickle_file():
 
 
 @pytest.fixture
-def setup_sql_file(conn, filename, table, force=False):
-    if os.path.exists(filename) and not force:
-        pass
-    else:
-        df = pandas.DataFrame(
-            {
-                "col1": [0, 1, 2, 3, 4, 5, 6],
-                "col2": [7, 8, 9, 10, 11, 12, 13],
-                "col3": [14, 15, 16, 17, 18, 19, 20],
-                "col4": [21, 22, 23, 24, 25, 26, 27],
-                "col5": [0, 0, 0, 0, 0, 0, 0],
-            }
-        )
-        df.to_sql(table, conn)
-
-
-@pytest.fixture
-def teardown_sql_file(filename):
-    if os.path.exists(filename):
-        os.remove(filename)
+def make_sql_connection():
+    """Set up SQLite database files and remove them after the caller is done.
+
+    Yields:
+        Factory that prepares a database file and returns its "sqlite:///" connection URI
+    """
+    filenames = []
+
+    def _sql_connection(filename, table=""):
+        # Remove the file if it already exists
+        if os.path.exists(filename):
+            os.remove(filename)
+        filenames.append(filename)
+
+        # Create connection and, if needed, table
+        conn = "sqlite:///{}".format(filename)
+        if table:
+            df = pandas.DataFrame(
+                {
+                    "col1": [0, 1, 2, 3, 4, 5, 6],
+                    "col2": [7, 8, 9, 10, 11, 12, 13],
+                    "col3": [14, 15, 16, 17, 18, 19, 20],
+                    "col4": [21, 22, 23, 24, 25, 26, 27],
+                    "col5": [0, 0, 0, 0, 0, 0, 0],
+                }
+            )
+            df.to_sql(table, conn)
+        return conn
+
+    yield _sql_connection
+
+    # Tear down the fixture: remove every database file that was created
+    for filename in filenames:
+        if os.path.exists(filename):
+            os.remove(filename)
 
 
 def test_from_parquet():
@@ -460,21 +472,17 @@ def test_from_pickle():
     teardown_pickle_file()
 
 
-def test_from_sql():
+def test_from_sql(make_sql_connection):
     filename = "test_from_sql.db"
-    teardown_sql_file(filename)
-    conn = sqlite3.connect(filename)
     table = "test_from_sql"
-    setup_sql_file(conn, filename, table, True)
+    conn = make_sql_connection(filename, table)
     query = "select * from {0}".format(table)
 
     pandas_df = pandas.read_sql(query, conn)
     modin_df = pd.read_sql(query, conn)
 
     assert modin_df_equals_pandas(modin_df, pandas_df)
 
-    teardown_sql_file(filename)
-
 
 @pytest.mark.skip(reason="No SAS write methods in Pandas")
 def test_from_sas():
@@ -750,20 +758,40 @@ def test_to_pickle():
     teardown_test_file(TEST_PICKLE_DF_FILENAME)
 
 
-def test_to_sql():
+def test_to_sql_without_index(make_sql_connection):
+    table_name = "tbl_without_index"
     modin_df = create_test_ray_dataframe()
     pandas_df = create_test_pandas_dataframe()
 
-    TEST_SQL_DF_FILENAME = "test_df.sql"
-    TEST_SQL_pandas_FILENAME = "test_pandas.sql"
+    # We do not pass the table name so the fixture won't generate a table
+    conn = make_sql_connection("test_to_sql.db")
+    modin_df.to_sql(table_name, conn, index=False)
+    df_modin_sql = pandas.read_sql(table_name, con=conn)
+
+    # We do not pass the table name so the fixture won't generate a table
+    conn = make_sql_connection("test_to_sql_pandas.db")
+    pandas_df.to_sql(table_name, conn, index=False)
+    df_pandas_sql = pandas.read_sql(table_name, con=conn)
+
+    assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
+
+
+def test_to_sql_with_index(make_sql_connection):
+    table_name = "tbl_with_index"
+    modin_df = create_test_ray_dataframe()
+    pandas_df = create_test_pandas_dataframe()
 
-    modin_df.to_pickle(TEST_SQL_DF_FILENAME)
-    pandas_df.to_pickle(TEST_SQL_pandas_FILENAME)
+    # We do not pass the table name so the fixture won't generate a table
+    conn = make_sql_connection("test_to_sql.db")
+    modin_df.to_sql(table_name, conn)
+    df_modin_sql = pandas.read_sql(table_name, con=conn, index_col="index")
 
-    assert test_files_eq(TEST_SQL_DF_FILENAME, TEST_SQL_pandas_FILENAME)
+    # We do not pass the table name so the fixture won't generate a table
+    conn = make_sql_connection("test_to_sql_pandas.db")
+    pandas_df.to_sql(table_name, conn)
+    df_pandas_sql = pandas.read_sql(table_name, con=conn, index_col="index")
 
-    teardown_test_file(TEST_SQL_DF_FILENAME)
-    teardown_test_file(TEST_SQL_pandas_FILENAME)
+    assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())
 
 
 def test_to_stata():