Skip to content

Commit bc1e08c

Browse files
authored
make user data persistent and standardized (#175)
* delete old notebooks * delete old models dir+artifacts * delete old pdfestrian test data artifacts * add todo comments to study ranker model this is unrelated to the current changes, i just want to commit it somewhere * delete old citations test data * delete old dedupe model artifacts * delete old train deduper script * docs: update todos deployment readme * refactor: move ranker classes for readability * remove old ranker model * tests: delete old ranker model tests * fix: use consistent ranker model dir * refactor: avoid hard-coded ranker col names * include bigrams in study ranker features * add sranker prop to get num texts learned and retrain from scratch every 100 texts learned * change+hide file handling in study ranker api * build: add persistent file data volume * update file storage dirs in app config * start storing raw citation upload files to storage * tests: update fs config in conftest * fix: create/remove fs dirs in cli+api * ci: add fs root dir to checks env * use filesystem in study ranker io * tests: update study ranker init calls
1 parent aeb6928 commit bc1e08c

33 files changed

+204
-818062
lines changed

.env.example

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,15 @@ COLANDR_MAIL_USE_TLS=0
1616
COLANDR_MAIL_USE_SSL=0
1717
# filesystem storage
1818
COLANDR_APP_DIR="/app"
19-
COLANDR_FILESYSTEM_PROTOCOL="file" # in prod, use "gcs"
20-
COLANDR_FILESYSTEM_ROOT_DIR="/tmp" # in prod, use "<GCS_BUCKET>"
21-
COLANDR_FILESYSTEM_GCS_PROJECT="<GCS_PROJECT>"
22-
COLANDR_FILESYSTEM_GCS_TOKEN="<GCS_TOKEN>"
23-
COLANDR_FILESYSTEM_GCS_ENDPOINT_URL="http://colandr-gcs:4443" # dev-only!
19+
# file-based filesystem
20+
COLANDR_FILESYSTEM_PROTOCOL="file"
21+
COLANDR_FILESYSTEM_ROOT_DIR="/app/data"
22+
COLANDR_FILESYSTEM_GCS_ENDPOINT_URL="http://colandr-gcs:4443" # dev-only hack for gcs
23+
# gcs-based filesystem
24+
# COLANDR_FILESYSTEM_PROTOCOL="gcs"
25+
# COLANDR_FILESYSTEM_ROOT_DIR="<GCS_BUCKET>"
26+
# COLANDR_FILESYSTEM_GCS_PROJECT="<GCS_PROJECT>"
27+
# COLANDR_FILESYSTEM_GCS_TOKEN="<GCS_TOKEN>"
2428
# metadata extraction config
2529
COLANDR_METADATA_THRESHOLD=0.65
2630
COLANDR_METADATA_INCREASE_TO_RETRAIN=5

.github/workflows/checks.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ env:
1717
COLANDR_DATABASE_URI: "postgresql+psycopg://colandr_app:password@localhost:5432/colandr"
1818
COLANDR_SECRET_KEY: "colandr_secret_key"
1919
COLANDR_APP_DIR: "/tmp"
20+
COLANDR_FILESYSTEM_ROOT_DIR: "/tmp/data"
2021
COLANDR_MAIL_USERNAME: "colandr_mail_username"
2122
COLANDR_MAIL_PASSWORD: "colandr_mail_password"
2223
# COLANDR_FILESYSTEM_GCS_ENDPOINT_URL: "http://colandr-gcs:4443"

TODO.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44
- [ ] Improve and extend NLP functionality, just across the board
55
- [ ] Enable https everywhere (via [let's encrypt](https://letsencrypt.org/)?)
66
- [ ] Consider removing async tasks (and db columns?) for text content vectors
7+
- [ ] Add script to train a v2 (splink-based) deduper model
8+
- [ ] Add notebook demonstrating functionality of various NLP models

colandr/api/v1/routes/citation_imports.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import os
23
import pathlib
34
import typing as t
@@ -163,9 +164,13 @@ def post(self, files_data, query_data):
163164
message=f"received invalid file type for citation import: '{fext}'"
164165
)
165166

167+
# unfortunately, we need to read the full file into memory rather than streaming
168+
# so we can preprocess the citations and later save the raw file to disk
169+
uploaded_data = uploaded_file.stream.read()
170+
166171
try:
167172
citations_to_insert = _preprocess_citations(
168-
uploaded_file.stream, fname, review_id
173+
io.BytesIO(uploaded_data), fname, review_id
169174
)
170175
except ValueError as e:
171176
current_app.logger.exception(str(e))
@@ -216,6 +221,22 @@ def post(self, files_data, query_data):
216221
fname,
217222
review,
218223
)
224+
225+
fs = current_app.extensions["filesystem"]
226+
# assign filename based an id, and full path
227+
filename = f"{citations_import.id}{fext}"
228+
filepath = os.path.join(
229+
current_app.config["CITATION_UPLOADS_DIR"],
230+
f"review_{review_id:08}",
231+
filename,
232+
)
233+
# make review directory if doesn't already exist
234+
fs.makedirs(os.path.dirname(filepath), exist_ok=True)
235+
# save content to file on filesystem
236+
with fs.open(filepath, mode="wb") as f:
237+
# uploaded_file.save(f) may also work well
238+
f.write(uploaded_data)
239+
219240
# lastly, don't forget to deduplicate the citations and get their word2vecs
220241
tasks.get_citations_text_content_vectors.apply_async(
221242
args=[review_id], countdown=3

colandr/api/v1/routes/fulltext_uploads.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,14 @@ def post(self, id, files_data):
155155
message=f'invalid fulltext upload file type: "{ext}"'
156156
)
157157

158+
fs = current_app.extensions["filesystem"]
158159
# assign filename based on id, and full path
159160
filename = f"{id}{ext}"
160161
filepath = os.path.join(
161162
current_app.config["FULLTEXT_UPLOADS_DIR"],
162163
str(study.review_id),
163164
filename,
164165
)
165-
fs = current_app.extensions["filesystem"]
166166
# make review directory if it doesn't already exist
167167
fs.makedirs(os.path.dirname(filepath), exist_ok=True)
168168
# save content to file on filesystem

colandr/api/v1/routes/reviews.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ def delete(self, id):
7373
# remove directories on disk for review data
7474
dirnames = [
7575
os.path.join(current_app.config["FULLTEXT_UPLOADS_DIR"], str(id)),
76-
os.path.join(current_app.config["RANKING_MODELS_DIR"], str(id)),
76+
os.path.join(current_app.config["CITATION_UPLOADS_DIR"], f"review_{id:08}"),
77+
os.path.join(current_app.config["RANKER_MODELS_DIR"], f"review_{id:08}"),
7778
]
7879
for dirname in dirnames:
7980
shutil.rmtree(dirname, ignore_errors=True)
@@ -171,7 +172,12 @@ def post(self, json_data):
171172
# create directories on disk for review data
172173
dirnames = [
173174
os.path.join(current_app.config["FULLTEXT_UPLOADS_DIR"], str(review.id)),
174-
os.path.join(current_app.config["RANKING_MODELS_DIR"], str(review.id)),
175+
os.path.join(
176+
current_app.config["CITATION_UPLOADS_DIR"], f"review_{review.id:08}"
177+
),
178+
os.path.join(
179+
current_app.config["RANKER_MODELS_DIR"], f"review_{review.id:08}"
180+
),
175181
]
176182
for dirname in dirnames:
177183
try:

colandr/api/v1/routes/studies.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -395,9 +395,11 @@ def get(self, query_data):
395395

396396
# best option: we have a trained study ranker model
397397
study_ranker = StudyRanker(
398-
review_id, current_app.config["RANKER_MODELS_DIR"]
398+
review_id,
399+
current_app.config["RANKER_MODELS_DIR"],
400+
current_app.extensions["filesystem"],
399401
)
400-
if study_ranker.model_fpath.exists():
402+
if study_ranker.model_exists:
401403
records = (
402404
{
403405
"text": (

colandr/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def reset_db():
9595
current_app.logger.warning("resetting database ...")
9696
db.drop_all()
9797
db.create_all()
98-
for dirkey in ("FULLTEXT_UPLOADS_DIR", "RANKING_MODELS_DIR"):
98+
for dirkey in ("FULLTEXT_UPLOADS_DIR", "CITATION_UPLOADS_DIR", "RANKER_MODELS_DIR"):
9999
shutil.rmtree(current_app.config[dirkey], ignore_errors=True)
100100
os.makedirs(current_app.config[dirkey], exist_ok=True)
101101

colandr/config.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,17 +85,17 @@
8585
"cache_timeout": 3600,
8686
},
8787
}
88-
FILESYSTEM_ROOT_DIR = os.environ.get("COLANDR_FILESYSTEM_ROOT_DIR", "/tmp")
89-
FULLTEXT_UPLOADS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "colandr_data", "fulltexts")
88+
FILESYSTEM_ROOT_DIR = os.environ.get("COLANDR_FILESYSTEM_ROOT_DIR", "/app/data")
89+
RANKER_MODELS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "ranker_models")
90+
CITATION_UPLOADS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "citations")
91+
FULLTEXT_UPLOADS_DIR = os.path.join(FILESYSTEM_ROOT_DIR, "fulltexts")
9092
ALLOWED_CITATION_UPLOAD_EXTENSIONS = {".ris", ".txt", ".bib", ".csv", ".tsv"}
9193
ALLOWED_FULLTEXT_UPLOAD_EXTENSIONS = {".txt", ".pdf"}
92-
# TODO: figure out root dir vs app dir
93-
COLANDR_APP_DIR = os.environ.get("COLANDR_APP_DIR", "/tmp")
94+
95+
COLANDR_APP_DIR = os.environ.get("COLANDR_APP_DIR", "/app")
9496
DEDUPE_MODELS_DIR = os.path.join(
9597
COLANDR_APP_DIR, "colandr_data", "dedupe-v2", "model_202407"
9698
)
97-
RANKER_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "ranker_models")
98-
RANKING_MODELS_DIR = os.path.join(COLANDR_APP_DIR, "colandr_data", "ranking_models")
9999

100100
# metadata extraction config
101101
METADATA_THRESHOLD = float(os.environ.get("COLANDR_METADATA_THRESHOLD", "0.65"))

colandr/lib/models/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
from .deduper_v2 import DeduperV2
2-
from .ranker import Ranker
32
from .study_ranker import StudyRanker

0 commit comments

Comments
 (0)