
Commit 7a98493

Author: Benjamin Moody
sqlite/import.py: avoid dependence on sqlalchemy.
To import MIMIC-IV into SQLite, import.py uses Pandas both to parse each data file (read_csv) and to push the data into an SQL database (to_sql). The latter step can use an SQLAlchemy database connection for full generality (which might be useful sometimes), but it can also simply use an sqlite3.Connection created by the Python standard library. Since this script is solely aimed at providing an easy way to get the data into SQLite format, it's nice to avoid unnecessary dependencies.
1 parent 1ff562b commit 7a98493
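
For context, the heart of the change is that pandas' DataFrame.to_sql accepts a plain sqlite3.Connection just as readily as an SQLAlchemy engine or connection string. A minimal before/after sketch (the file name "admissions.csv" and table name "admissions" are illustrative, not taken from the commit):

import sqlite3

import pandas as pd

# Before: passing a connection string made pandas build an SQLAlchemy engine
# under the hood, so sqlalchemy had to be installed.
# pd.read_csv("admissions.csv").to_sql("admissions", "sqlite:///mimic4.db")

# After: a standard-library sqlite3 connection is enough when the target is SQLite.
with sqlite3.connect("mimic4.db") as connection:
    pd.read_csv("admissions.csv").to_sql("admissions", connection, if_exists="append")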

2 files changed: +19, -20 lines

mimic-iv/buildmimic/sqlite/README.md

Lines changed: 1 addition & 3 deletions

@@ -15,9 +15,7 @@ into memory. It only needs three things to run:
 `import.py` is a python script. It requires the following to run:
 
 1. Python 3 installed
-2. SQLite
-3. [pandas](https://pandas.pydata.org/)
-4. [sqlalchemy](https://www.sqlalchemy.org/)
+2. [pandas](https://pandas.pydata.org/)
 
 ## Step 1: Download the CSV or CSV.GZ files.
 
mimic-iv/buildmimic/sqlite/import.py

Lines changed: 18 additions & 17 deletions
@@ -1,4 +1,5 @@
 import os
+import sqlite3
 import sys
 
 from glob import glob
@@ -7,28 +8,28 @@
 DATABASE_NAME = "mimic4.db"
 THRESHOLD_SIZE = 5 * 10**7
 CHUNKSIZE = 10**6
-CONNECTION_STRING = "sqlite:///{}".format(DATABASE_NAME)
 
 if os.path.exists(DATABASE_NAME):
     msg = "File {} already exists.".format(DATABASE_NAME)
     print(msg)
     sys.exit()
 
-for f in glob("**/*.csv*", recursive=True):
-    print("Starting processing {}".format(f))
-    folder, filename = os.path.split(f)
-    tablename = filename.lower()
-    if tablename.endswith('.gz'):
-        tablename = tablename[:-3]
-    if tablename.endswith('.csv'):
-        tablename = tablename[:-4]
-    if os.path.getsize(f) < THRESHOLD_SIZE:
-        df = pd.read_csv(f)
-        df.to_sql(tablename, CONNECTION_STRING)
-    else:
-        # If the file is too large, let's do the work in chunks
-        for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False):
-            chunk.to_sql(tablename, CONNECTION_STRING, if_exists="append")
-    print("Finished processing {}".format(f))
+with sqlite3.Connection(DATABASE_NAME) as connection:
+    for f in glob("**/*.csv*", recursive=True):
+        print("Starting processing {}".format(f))
+        folder, filename = os.path.split(f)
+        tablename = filename.lower()
+        if tablename.endswith('.gz'):
+            tablename = tablename[:-3]
+        if tablename.endswith('.csv'):
+            tablename = tablename[:-4]
+        if os.path.getsize(f) < THRESHOLD_SIZE:
+            df = pd.read_csv(f)
+            df.to_sql(tablename, connection)
+        else:
+            # If the file is too large, let's do the work in chunks
+            for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False):
+                chunk.to_sql(tablename, connection, if_exists="append")
+        print("Finished processing {}".format(f))
 
 print("Should be all done!")
