datakind
diff --git a/‎.env.example‎
Lines changed: 9 additions & 5 deletions b/‎.env.example‎
Lines changed: 9 additions & 5 deletions
diff --git a/‎.github/workflows/checks.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/checks.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎TODO.md‎
Lines changed: 2 additions & 0 deletions b/‎TODO.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎colandr/api/v1/routes/citation_imports.py‎
Lines changed: 22 additions & 1 deletion b/‎colandr/api/v1/routes/citation_imports.py‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎colandr/api/v1/routes/fulltext_uploads.py‎
Lines changed: 1 addition & 1 deletion b/‎colandr/api/v1/routes/fulltext_uploads.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎colandr/api/v1/routes/reviews.py‎
Lines changed: 8 additions & 2 deletions b/‎colandr/api/v1/routes/reviews.py‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎colandr/api/v1/routes/studies.py‎
Lines changed: 4 additions & 2 deletions b/‎colandr/api/v1/routes/studies.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎colandr/cli.py‎
Lines changed: 1 addition & 1 deletion b/‎colandr/cli.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎colandr/config.py‎
Lines changed: 6 additions & 6 deletions b/‎colandr/config.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎colandr/lib/models/__init__.py‎
Lines changed: 0 additions & 1 deletion b/‎colandr/lib/models/__init__.py‎
Lines changed: 0 additions & 1 deletion
@@ -16,11 +16,15 @@ COLANDR_MAIL_USE_TLS=0
 COLANDR_MAIL_USE_SSL=0
 # filesystem storage
 COLANDR_APP_DIR="/app"
-COLANDR_FILESYSTEM_PROTOCOL="file"  # in prod, use "gcs"
-COLANDR_FILESYSTEM_ROOT_DIR="/tmp"  # in prod, use "<GCS_BUCKET>"
-COLANDR_FILESYSTEM_GCS_PROJECT="<GCS_PROJECT>"
-COLANDR_FILESYSTEM_GCS_TOKEN="<GCS_TOKEN>"
-COLANDR_FILESYSTEM_GCS_ENDPOINT_URL="http://colandr-gcs:4443"  # dev-only!
+# file-based filesystem
+COLANDR_FILESYSTEM_PROTOCOL="file"
+COLANDR_FILESYSTEM_ROOT_DIR="/app/data"
+COLANDR_FILESYSTEM_GCS_ENDPOINT_URL="http://colandr-gcs:4443"  # dev-only hack for gcs
+# gcs-based filesystem
+# COLANDR_FILESYSTEM_PROTOCOL="gcs"
+# COLANDR_FILESYSTEM_ROOT_DIR="<GCS_BUCKET>"
+# COLANDR_FILESYSTEM_GCS_PROJECT="<GCS_PROJECT>"
+# COLANDR_FILESYSTEM_GCS_TOKEN="<GCS_TOKEN>"
 # metadata extraction config
 COLANDR_METADATA_THRESHOLD=0.65
 COLANDR_METADATA_INCREASE_TO_RETRAIN=5
 
@@ -17,6 +17,7 @@ env:
   COLANDR_DATABASE_URI: "postgresql+psycopg://colandr_app:password@localhost:5432/colandr"
   COLANDR_SECRET_KEY: "colandr_secret_key"
   COLANDR_APP_DIR: "/tmp"
+  COLANDR_FILESYSTEM_ROOT_DIR: "/tmp/data"
   COLANDR_MAIL_USERNAME: "colandr_mail_username"
   COLANDR_MAIL_PASSWORD: "colandr_mail_password"
   # COLANDR_FILESYSTEM_GCS_ENDPOINT_URL: "http://colandr-gcs:4443"
 
@@ -4,3 +4,5 @@
 - [ ] Improve and extend NLP functionality, just across the board
 - [ ] Enable https everywhere (via [let's encrypt](https://letsencrypt.org/)?)
 - [ ] Consider removing async tasks (and db columns?) for text content vectors
+- [ ] Add script to train a v2 (splink-based) deduper model
+- [ ] Add notebook demonstrating functionality of various NLP models
@@ -1,3 +1,4 @@
+import io
 import os
 import pathlib
 import typing as t
@@ -163,9 +164,13 @@ def post(self, files_data, query_data):
                 message=f"received invalid file type for citation import: '{fext}'"
             )
 
+        # unfortunately, we need to read the full file into memory rather than streaming
+        # so we can preprocess the citations and later save the raw file to disk
+        uploaded_data = uploaded_file.stream.read()
+
         try:
             citations_to_insert = _preprocess_citations(
-                uploaded_file.stream, fname, review_id
+                io.BytesIO(uploaded_data), fname, review_id
             )
         except ValueError as e:
             current_app.logger.exception(str(e))
@@ -216,6 +221,22 @@ def post(self, files_data, query_data):
             fname,
             review,
         )
+
+        fs = current_app.extensions["filesystem"]
+        # assign filename based on the citations import id, then build the full path
+        filename = f"{citations_import.id}{fext}"
+        filepath = os.path.join(
+            current_app.config["CITATION_UPLOADS_DIR"],
+            f"review_{review_id:08}",
+            filename,
+        )
+        # make the review directory if it doesn't already exist
+        fs.makedirs(os.path.dirname(filepath), exist_ok=True)
+        # save content to file on filesystem
+        with fs.open(filepath, mode="wb") as f:
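+            # NOTE: uploaded_file.save(f) may also work, but only after rewinding
+            # uploaded_file.stream, since it was already fully read earlier in this method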
+            f.write(uploaded_data)
+
         # lastly, don't forget to deduplicate the citations and get their word2vecs
         tasks.get_citations_text_content_vectors.apply_async(
             args=[review_id], countdown=3
 
@@ -155,14 +155,14 @@ def post(self, id, files_data):
                 message=f'invalid fulltext upload file type: "{ext}"'
             )
 
+        fs = current_app.extensions["filesystem"]
         # assign filename based an id, and full path
         filename = f"{id}{ext}"
         filepath = os.path.join(
             current_app.config["FULLTEXT_UPLOADS_DIR"],
             str(study.review_id),
             filename,
         )
-        fs = current_app.extensions["filesystem"]
         # make review directory if doesn't already exist
         fs.makedirs(os.path.dirname(filepath), exist_ok=True)
         # save content to file on filesystem
 
@@ -73,7 +73,8 @@ def delete(self, id):
         # remove directories on disk for review data
         dirnames = [
             os.path.join(current_app.config["FULLTEXT_UPLOADS_DIR"], str(id)),
-            os.path.join(current_app.config["RANKING_MODELS_DIR"], str(id)),
+            os.path.join(current_app.config["CITATION_UPLOADS_DIR"], f"review_{id:08}"),
+            os.path.join(current_app.config["RANKER_MODELS_DIR"], f"review_{id:08}"),
         ]
         for dirname in dirnames:
             shutil.rmtree(dirname, ignore_errors=True)
@@ -171,7 +172,12 @@ def post(self, json_data):
         # create directories on disk for review data
         dirnames = [
             os.path.join(current_app.config["FULLTEXT_UPLOADS_DIR"], str(review.id)),
-            os.path.join(current_app.config["RANKING_MODELS_DIR"], str(review.id)),
+            os.path.join(
+                current_app.config["CITATION_UPLOADS_DIR"], f"review_{review.id:08}"
+            ),
+            os.path.join(
+                current_app.config["RANKER_MODELS_DIR"], f"review_{review.id:08}"
+            ),
         ]
         for dirname in dirnames:
             try:
 
@@ -395,9 +395,11 @@ def get(self, query_data):
 
             # best option: we have a trained study ranker model
             study_ranker = StudyRanker(
-                review_id, current_app.config["RANKER_MODELS_DIR"]
+                review_id,
+                current_app.config["RANKER_MODELS_DIR"],
+                current_app.extensions["filesystem"],
             )
-            if study_ranker.model_fpath.exists():
+            if study_ranker.model_exists:
                 records = (
                     {
                         "text": (
 
@@ -95,7 +95,7 @@ def reset_db():
     current_app.logger.warning("resetting database ...")
     db.drop_all()
     db.create_all()
-    for dirkey in ("FULLTEXT_UPLOADS_DIR", "RANKING_MODELS_DIR"):
+    for dirkey in ("FULLTEXT_UPLOADS_DIR", "CITATION_UPLOADS_DIR", "RANKER_MODELS_DIR"):
         shutil.rmtree(current_app.config[dirkey], ignore_errors=True)
         os.makedirs(current_app.config[dirkey], exist_ok=True)
 
 
@@ -85,17 +85,17 @@
         "cache_timeout": 3600,
     },
 }
-FILESYSTEM_ROOT_DIR = os.environ.get("COLANDR_FILESYSTEM_ROOT_DIR", "/tmp")
-FULLTEXT_UPLOADS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "colandr_data", "fulltexts")
+FILESYSTEM_ROOT_DIR = os.environ.get("COLANDR_FILESYSTEM_ROOT_DIR", "/app/data")
+RANKER_MODELS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "ranker_models")
+CITATION_UPLOADS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "citations")
+FULLTEXT_UPLOADS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "fulltexts")
 ALLOWED_CITATION_UPLOAD_EXTENSIONS = {".ris", ".txt", ".bib", ".csv", ".tsv"}
 ALLOWED_FULLTEXT_UPLOAD_EXTENSIONS = {".txt", ".pdf"}
-# TODO: figure out root dir vs app dir
-COLANDR_APP_DIR = os.environ.get("COLANDR_APP_DIR", "/tmp")
+
+COLANDR_APP_DIR = os.environ.get("COLANDR_APP_DIR", "/app")
 DEDUPE_MODELS_DIR = os.path.join(
     COLANDR_APP_DIR, "colandr_data", "dedupe-v2", "model_202407"
 )
-RANKER_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "ranker_models")
-RANKING_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "ranking_models")
 
 # metadata extraction config
 METADATA_THRESHOLD = float(os.environ.get("COLANDR_METADATA_THRESHOLD", "0.65"))
 
@@ -1,3 +1,2 @@
 from .deduper_v2 import DeduperV2
-from .ranker import Ranker
 from .study_ranker import StudyRanker
Original file line number	Diff line number	Diff line change
`@@ -395,9 +395,11 @@ def get(self, query_data):`
`395`	`395`
`396`	`396`	`# best option: we have a trained study ranker model`
`397`	`397`	`study_ranker = StudyRanker(`
`398`		`- review_id, current_app.config["RANKER_MODELS_DIR"]`
	`398`	`+ review_id,`
	`399`	`+ current_app.config["RANKER_MODELS_DIR"],`
	`400`	`+ current_app.extensions["filesystem"],`
`399`	`401`	`)`
`400`		`- if study_ranker.model_fpath.exists():`
	`402`	`+ if study_ranker.model_exists:`
`401`	`403`	`records = (`
`402`	`404`	`{`
`403`	`405`	`"text": (`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`from .deduper_v2 import DeduperV2`
`2`		`-from .ranker import Ranker`
`3`	`2`	`from .study_ranker import StudyRanker`