From 86b2ad98134590f12dfa7af3ee60c01a4cad60db Mon Sep 17 00:00:00 2001 From: Hanxin Chen Date: Mon, 18 Dec 2023 01:22:34 -0500 Subject: [PATCH 1/8] add docs for read_sql to avoid sql injection --- pandas/io/sql.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index a83c2bf241450..01e0eea37c02d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -644,6 +644,37 @@ def read_sql( read_sql_table : Read SQL database table into a DataFrame. read_sql_query : Read SQL query into a DataFrame. + Notes + ----- + Using string interpolation (e.g. ``f-strings``, ``%-formatting``, + ``str.format()``, etc.) in a SQL query may cause SQL injection. + For example, the code below will insert unexpected data into ``test_data`` table. + + >>> from sqlite3 import connect + >>> from sqlalchemy import create_engine + >>> engine = create_engine('postgresql:///test_db') + >>> conn = engine.connect() + + >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']], + ... columns=['int_column', 'date_column']) + >>> df.to_sql(name='test_data', con=conn) + 2 + + >>> # DON'T DO THIS + >>> query_int = "1; INSERT INTO test_data VALUES (2, 2, '09/11/12') RETURNING *;" + >>> pd.read_sql(f'SELECT * FROM test_data WHERE int_column={query_int}', conn) + index int_column date_column + 0 2 2 09/11/12 + >>> conn.commit() + + Instead, use the ``params`` argument: + + >>> from sqlalchemy import text + >>> sql = text('SELECT * FROM test_data WHERE int_column=:int_val') + >>> pd.read_sql(sql, conn, params={'int_val': 1}) + index int_column date_column + 0 1 1 12/11/10 + Examples -------- Read data from SQL via either a SQL query or a SQL tablename. 
From 5909846ecd3321b95709ffb0914c06caa4d98ef3 Mon Sep 17 00:00:00 2001 From: Hanxin Chen Date: Mon, 18 Dec 2023 01:37:39 -0500 Subject: [PATCH 2/8] resolve formatting --- pandas/io/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 01e0eea37c02d..a4597558a8991 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -646,7 +646,7 @@ def read_sql( Notes ----- - Using string interpolation (e.g. ``f-strings``, ``%-formatting``, + Using string interpolation (e.g. ``f-strings``, ``%-formatting``, ``str.format()``, etc.) in a SQL query may cause SQL injection. For example, the code below will insert unexpected data into ``test_data`` table. From 206b52ff8930f0fbeabbad98c5b5e8900685b896 Mon Sep 17 00:00:00 2001 From: erichxchen Date: Tue, 26 Dec 2023 16:04:18 -0500 Subject: [PATCH 3/8] re-word the docs --- pandas/io/sql.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index a4597558a8991..c178f5c5a7a64 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -647,22 +647,22 @@ def read_sql( Notes ----- Using string interpolation (e.g. ``f-strings``, ``%-formatting``, - ``str.format()``, etc.) in a SQL query may cause SQL injection. + ``str.format()``, etc.) in a SQL query may allow SQL injection + if query inputs aren't sanitized. For example, the code below will insert unexpected data into ``test_data`` table. - >>> from sqlite3 import connect >>> from sqlalchemy import create_engine - >>> engine = create_engine('postgresql:///test_db') - >>> conn = engine.connect() + >>> engine = create_engine('postgresql:///test_db') # doctest:+SKIP + >>> conn = engine.connect() # doctest:+SKIP >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']], ... 
columns=['int_column', 'date_column']) - >>> df.to_sql(name='test_data', con=conn) + >>> df.to_sql(name='test_data', con=conn) # doctest:+SKIP 2 >>> # DON'T DO THIS >>> query_int = "1; INSERT INTO test_data VALUES (2, 2, '09/11/12') RETURNING *;" - >>> pd.read_sql(f'SELECT * FROM test_data WHERE int_column={query_int}', conn) + >>> pd.read_sql(f'SELECT * FROM test_data WHERE int_column={query_int}', conn) # doctest:+SKIP index int_column date_column 0 2 2 09/11/12 >>> conn.commit() @@ -671,7 +671,7 @@ def read_sql( >>> from sqlalchemy import text >>> sql = text('SELECT * FROM test_data WHERE int_column=:int_val') - >>> pd.read_sql(sql, conn, params={'int_val': 1}) + >>> pd.read_sql(sql, conn, params={'int_val': 1}) # doctest:+SKIP index int_column date_column 0 1 1 12/11/10 From 4ec7ff1090b25ab14b4a7179181e26e224bc3e71 Mon Sep 17 00:00:00 2001 From: erichxchen Date: Tue, 26 Dec 2023 16:12:55 -0500 Subject: [PATCH 4/8] formatting --- pandas/io/sql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c178f5c5a7a64..7bfa14eb720d0 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -647,7 +647,7 @@ def read_sql( Notes ----- Using string interpolation (e.g. ``f-strings``, ``%-formatting``, - ``str.format()``, etc.) in a SQL query may allow SQL injection + ``str.format()``, etc.) in a SQL query may allow SQL injection if query inputs aren't sanitized. For example, the code below will insert unexpected data into ``test_data`` table. 
From 1a639d45816f2a90ab57acc566581563c01336fa Mon Sep 17 00:00:00 2001 From: erichxchen Date: Tue, 26 Dec 2023 16:24:48 -0500 Subject: [PATCH 5/8] refactor the example due to the length constraints --- pandas/io/sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 7bfa14eb720d0..a2538e6b0b1e7 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -662,7 +662,8 @@ def read_sql( >>> # DON'T DO THIS >>> query_int = "1; INSERT INTO test_data VALUES (2, 2, '09/11/12') RETURNING *;" - >>> pd.read_sql(f'SELECT * FROM test_data WHERE int_column={query_int}', conn) # doctest:+SKIP + >>> sql = f'SELECT * FROM test_data WHERE int_column={query_int}' + >>> pd.read_sql(sql, conn) # doctest:+SKIP index int_column date_column 0 2 2 09/11/12 >>> conn.commit() From 7018cb112f9f4c77877cd2e4d05e12bccc4bb9d1 Mon Sep 17 00:00:00 2001 From: erichxchen Date: Tue, 2 Jan 2024 01:37:45 -0500 Subject: [PATCH 6/8] remove reference directing users to use params --- pandas/io/sql.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index a2538e6b0b1e7..995faea508ec6 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -668,14 +668,6 @@ def read_sql( 0 2 2 09/11/12 >>> conn.commit() - Instead, use the ``params`` argument: - - >>> from sqlalchemy import text - >>> sql = text('SELECT * FROM test_data WHERE int_column=:int_val') - >>> pd.read_sql(sql, conn, params={'int_val': 1}) # doctest:+SKIP - index int_column date_column - 0 1 1 12/11/10 - Examples -------- Read data from SQL via either a SQL query or a SQL tablename. 
From 8221d94fe5f8f39f779d2898e2b6fc4e705b820c Mon Sep 17 00:00:00 2001 From: erichxchen Date: Sat, 13 Jan 2024 20:06:15 -0500 Subject: [PATCH 7/8] changed to the general warning --- pandas/io/sql.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 995faea508ec6..c9cfe9e22e02f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -646,27 +646,11 @@ def read_sql( Notes ----- - Using string interpolation (e.g. ``f-strings``, ``%-formatting``, - ``str.format()``, etc.) in a SQL query may allow SQL injection - if query inputs aren't sanitized. - For example, the code below will insert unexpected data into ``test_data`` table. - - >>> from sqlalchemy import create_engine - >>> engine = create_engine('postgresql:///test_db') # doctest:+SKIP - >>> conn = engine.connect() # doctest:+SKIP - - >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']], - ... columns=['int_column', 'date_column']) - >>> df.to_sql(name='test_data', con=conn) # doctest:+SKIP - 2 - - >>> # DON'T DO THIS - >>> query_int = "1; INSERT INTO test_data VALUES (2, 2, '09/11/12') RETURNING *;" - >>> sql = f'SELECT * FROM test_data WHERE int_column={query_int}' - >>> pd.read_sql(sql, conn) # doctest:+SKIP - index int_column date_column - 0 2 2 09/11/12 - >>> conn.commit() + ``pandas`` does not attempt to sanitize SQL statements; + instead it simply forwards the statement you are executing + to the underlying driver, which may or may not sanitize from there. + Please refer to the underlying driver documentation for any details. + Generally, be wary when accepting statements from arbitrary sources. 
Examples -------- From ef5f9f0f76849446192127127d7d1c95dd399466 Mon Sep 17 00:00:00 2001 From: erichxchen Date: Wed, 17 Jan 2024 01:20:41 -0500 Subject: [PATCH 8/8] add the example for using params --- pandas/io/sql.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index bd7a91f3936ba..d9a5e6dfd0cf8 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -680,6 +680,14 @@ def read_sql( >>> pd.read_sql('test_data', 'postgres:///db_name') # doctest:+SKIP + For parameterized queries, using ``params`` is recommended over string interpolation. + + >>> from sqlalchemy import text + >>> sql = text('SELECT int_column, date_column FROM test_data WHERE int_column=:int_val') + >>> pd.read_sql(sql, conn, params={'int_val': 1}) # doctest:+SKIP + int_column date_column + 0 1 12/11/10 + Apply date parsing to columns through the ``parse_dates`` argument The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns. Custom argument values for applying ``pd.to_datetime`` on a column are specified @@ -702,7 +710,7 @@ def read_sql( int_column 0 0 1 1 - """ + """ # noqa: E501 check_dtype_backend(dtype_backend) if dtype_backend is lib.no_default: