Merge pull request #1455 from MIT-LCP/bm/m4-sqlite-no-sqlalchemy

alistairewj · web-flow · commit 0f8982268a9e · 2022-12-23T22:11:54.000-05:00
Avoid unnecessary dependency on sqlalchemy
diff --git a/.github/workflows/sqlite.yml b/.github/workflows/sqlite.yml
@@ -0,0 +1,39 @@
+name: sqlite demo db build
+on:
+  pull_request_review:
+    types: [submitted]
+
+jobs:
+  mimic-iv-sqlite:
+    # only run when the submitted review approves the PR
+    if: github.event.review.state == 'approved'
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Python dependencies
+        run: |
+          pip install pandas
+
+      - name: Download demo data
+        uses: ./.github/actions/download-demo
+        with:
+            gcp-project-id: ${{ secrets.GCP_PROJECT_ID }}
+            gcp-sa-key: ${{ secrets.GCP_SA_KEY }}
+
+      - name: Load icu/hosp data into SQLite
+        run: |
+          echo "Running SQLite build."
+          python ${BUILDCODE_PATH}/import.py
+
+          echo `md5sum mimic4.db`
+
+        env:
+          BUILDCODE_PATH: mimic-iv/buildmimic/sqlite
diff --git a/mimic-iv/buildmimic/sqlite/README.md b/mimic-iv/buildmimic/sqlite/README.md
@@ -15,9 +15,7 @@ into memory. It only needs three things to run:
 `import.py` is a python script. It requires the following to run:
 
 1. Python 3 installed
-2. SQLite
-3. [pandas](https://pandas.pydata.org/)
-4. [sqlalchemy](https://www.sqlalchemy.org/)
+2. [pandas](https://pandas.pydata.org/)
 
 ## Step 1: Download the CSV or CSV.GZ files.
 
diff --git a/mimic-iv/buildmimic/sqlite/import.py b/mimic-iv/buildmimic/sqlite/import.py
@@ -1,4 +1,5 @@
 import os
+import sqlite3
 import sys
 
 from glob import glob
@@ -7,28 +8,28 @@
 DATABASE_NAME = "mimic4.db"
 THRESHOLD_SIZE = 5 * 10**7
 CHUNKSIZE = 10**6
-CONNECTION_STRING = "sqlite:///{}".format(DATABASE_NAME)
 
 if os.path.exists(DATABASE_NAME):
     msg = "File {} already exists.".format(DATABASE_NAME)
     print(msg)
     sys.exit()
 
-for f in glob("**/*.csv*", recursive=True):
-    print("Starting processing {}".format(f))
-    folder, filename = os.path.split(f)
-    tablename = filename.lower()
-    if tablename.endswith('.gz'):
-        tablename = tablename[:-3]
-    if tablename.endswith('.csv'):
-        tablename = tablename[:-4]
-    if os.path.getsize(f) < THRESHOLD_SIZE:
-        df = pd.read_csv(f)
-        df.to_sql(tablename, CONNECTION_STRING)
-    else:
-        # If the file is too large, let's do the work in chunks
-        for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False):
-            chunk.to_sql(tablename, CONNECTION_STRING, if_exists="append")
-    print("Finished processing {}".format(f))
+with sqlite3.Connection(DATABASE_NAME) as connection:
+    for f in glob("**/*.csv*", recursive=True):
+        print("Starting processing {}".format(f))
+        folder, filename = os.path.split(f)
+        tablename = filename.lower()
+        if tablename.endswith('.gz'):
+            tablename = tablename[:-3]
+        if tablename.endswith('.csv'):
+            tablename = tablename[:-4]
+        if os.path.getsize(f) < THRESHOLD_SIZE:
+            df = pd.read_csv(f)
+            df.to_sql(tablename, connection)
+        else:
+            # Files of THRESHOLD_SIZE bytes or more are read and written in CHUNKSIZE-row chunks
+            for chunk in pd.read_csv(f, chunksize=CHUNKSIZE, low_memory=False):
+                chunk.to_sql(tablename, connection, if_exists="append")
+        print("Finished processing {}".format(f))
 
 print("Should be all done!")