Skip to content

Commit b7275b9

Browse files
authored
Merge pull request #206 from datakind/improve-scaling-export-studies
improve scaling for export studies/screenings
2 parents 7d00fc0 + fe2f5a7 commit b7275b9

File tree

2 files changed

+28
-24
lines changed

2 files changed

+28
-24
lines changed

colandr/api/v1/routes/exports.py

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
import apiflask as af
77
import flask_jwt_extended as jwtext
88
import sqlalchemy as sa
9-
from flask import current_app, make_response
9+
import sqlalchemy.orm as sa_orm
10+
from flask import Response, current_app, make_response, stream_with_context
1011
from flask.views import MethodView
1112

1213
from .... import models
@@ -86,17 +87,17 @@ def get(self, query_data):
8687
else:
8788
extraction_label_types = None
8889

89-
# TODO: make this query performant and fully streamable, even with lazy-loading
90-
# see: https://docs.sqlalchemy.org/en/14/errors.html#parent-instance-x-is-not-bound-to-a-session-lazy-load-deferred-load-refresh-etc-operation-cannot-proceed
91-
# see: https://docs.sqlalchemy.org/en/14/errors.html#object-cannot-be-converted-to-persistent-state-as-this-identity-map-is-no-longer-valid
92-
studies = db.session.execute(
90+
stmt = (
9391
sa.select(models.Study)
9492
.filter_by(review_id=review_id)
95-
.order_by(models.Study.id),
96-
execution_options={"prebuffer_rows": True},
97-
).scalars()
98-
# rows = (_study_to_row(study, extraction_label_types) for study in studies)
99-
rows = [_study_to_row(study, extraction_label_types) for study in studies]
93+
.options(
94+
sa_orm.joinedload(models.Study.data_source),
95+
sa_orm.joinedload(models.Study.data_extraction),
96+
)
97+
.order_by(models.Study.id)
98+
)
99+
studies = db.session.execute(stmt).scalars().yield_per(1000)
100+
rows = (_study_to_row(study, extraction_label_types) for study in studies)
100101
if content_type == "text/csv":
101102
export_data = fileio.tabular.write_stream(
102103
fieldnames, rows, quoting=csv.QUOTE_NONNUMERIC
@@ -105,12 +106,13 @@ def get(self, query_data):
105106
# NOTE: this can't happen owing to input schema validation
106107
raise NotImplementedError("only 'text/csv' content type is available")
107108

108-
response = make_response(export_data, 200)
109-
response.headers.update(
110-
{
109+
response = Response(
110+
stream_with_context(export_data),
111+
status=200,
112+
headers={
111113
"Content-Type": content_type,
112114
"Content-Disposition": "attachment; filename=colandr-review-studies.csv",
113-
}
115+
},
114116
)
115117
current_app.logger.info("%s exported studies data for %s", current_user, review)
116118
return response
@@ -219,11 +221,13 @@ def get(self, query_data):
219221
if not review:
220222
raise errors.NotFoundError(message=f"<Review(id={review_id})> not found")
221223

222-
screenings = db.session.execute(
224+
stmt = (
223225
sa.select(models.Screening)
226+
.options(sa_orm.joinedload(models.Screening.user))
224227
.filter_by(review_id=review_id)
225228
.order_by(models.Screening.id)
226-
).scalars()
229+
)
230+
screenings = db.session.execute(stmt).scalars().yield_per(1000)
227231
fieldnames = [
228232
"study_id",
229233
"screening_stage",
@@ -232,8 +236,7 @@ def get(self, query_data):
232236
"user_email",
233237
"user_name",
234238
]
235-
# rows = (_screening_to_row(screening) for screening in screenings)
236-
rows = [_screening_to_row(screening) for screening in screenings]
239+
rows = (_screening_to_row(screening) for screening in screenings)
237240
if content_type == "text/csv":
238241
export_data = fileio.tabular.write_stream(
239242
fieldnames, rows, quoting=csv.QUOTE_NONNUMERIC
@@ -242,12 +245,13 @@ def get(self, query_data):
242245
# NOTE: this can't happen owing to input schema validation
243246
raise NotImplementedError("only 'text/csv' content type is available")
244247

245-
response = make_response(export_data, 200)
246-
response.headers.update(
247-
{
248+
response = Response(
249+
stream_with_context(export_data),
250+
status=200,
251+
headers={
248252
"Content-Type": content_type,
249253
"Content-Disposition": "attachment; filename=colandr-review-screenings.csv",
250-
}
254+
},
251255
)
252256
current_app.logger.info(
253257
"%s exported screenings data for %s", current_user, review

colandr/lib/fileio/tabular.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import itertools
44
import logging
55
import typing as t
6-
from collections.abc import Iterable, Sequence
6+
from collections.abc import Iterable, Iterator, Sequence
77

88

99
LOGGER = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def write_stream(
3232
*,
3333
dialect="excel",
3434
**kwargs,
35-
) -> Iterable[str]:
35+
) -> Iterator[str]:
3636
"""
3737
Write tabular data (rows x cols) in CSV format, in-memory, streaming row-by-row.
3838

0 commit comments

Comments
 (0)