Skip to content

Commit a183e27

Browse files
authored
Merge pull request #256 from VariantEffect/feature/bencap/251/variant-mapping-script
Variant Mapping Script (and script environment improvements)
2 parents 0dc3ca4 + c6c9d4c commit a183e27

File tree

5 files changed

+403
-91
lines changed

5 files changed

+403
-91
lines changed

src/mavedb/lib/script_environment.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

src/mavedb/models/mapped_variant.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ class MappedVariant(Base):
1414

1515
id = Column(Integer, primary_key=True)
1616

17-
pre_mapped = Column(JSONB, nullable=True)
18-
post_mapped = Column(JSONB, nullable=True)
17+
pre_mapped = Column(JSONB(none_as_null=True), nullable=True)
18+
post_mapped = Column(JSONB(none_as_null=True), nullable=True)
1919
vrs_version = Column(String, nullable=True)
2020
error_message = Column(String, nullable=True)
2121
modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today)

src/mavedb/scripts/environment.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""
2+
Environment setup for scripts.
3+
"""
4+
5+
import enum
6+
import logging
7+
import click
8+
from functools import wraps
9+
10+
11+
from sqlalchemy.orm import configure_mappers
12+
13+
from mavedb import deps
14+
from mavedb.models import * # noqa: F403
15+
16+
17+
logger = logging.getLogger(__name__)
18+
19+
20+
@enum.unique
class DatabaseSessionAction(enum.Enum):
    """
    Transaction outcome selected via the CLI options that
    :py:func:`.with_database_session` adds to a command
    (``--dry-run``, ``--prompt``, ``--commit``).

    Only commands that pass ``pass_action = True`` to
    :py:func:`.with_database_session` ever see this class; the selected
    member is handed to them as an extra ``action`` argument.
    """

    DRY_RUN = "rollback"
    PROMPT = "prompt"
    COMMIT = "commit"
33+
34+
35+
@click.group()
def script_environment():
    """
    Prepare the runtime environment for a script that may be run from the
    command line and does not necessarily depend on the FastAPI framework.

    Setup performed:
    - configures logging for the script, and
    - loads the SQLAlchemy data model.
    """
    logging.basicConfig()
    logging.getLogger("__main__").setLevel(logging.INFO)

    # Un-comment the next line to log all database queries:
    # logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)

    # Scan every model class and create backref attributes up front. Without
    # this, those attributes only get added to a class once an instance of the
    # related class has been created.
    configure_mappers()
55+
56+
57+
def with_database_session(command=None, *, pass_action: bool = False):
    """
    Decorator to provide database session and error handling for a *command*.

    The *command* callable must be a :py:class:`click.Command` instance.

    The decorated *command* is called with a ``db`` keyword argument to provide
    a :class:`~id3c.db.session.DatabaseSession` object. The call happens
    within an exception handler that commits or rolls back the database
    transaction, possibly interactively. Three new options are added to the
    *command* (``--dry-run``, ``--prompt``, and ``--commit``) to control this
    behaviour.

    >>> @click.command
    ... @with_database_session
    ... def cmd(db: DatabaseSession):
    ...     pass

    If the optional, keyword-only argument *pass_action* is ``True``, then the
    :py:class:`.DatabaseSessionAction` selected by the CLI options above is
    passed as an additional ``action`` argument to the decorated *command*.

    >>> @click.command
    ... @with_database_session(pass_action = True)
    ... def cmd(db: DatabaseSession, action: DatabaseSessionAction):
    ...     pass

    One example where this is useful is when the *command* accesses
    non-database resources and wants to extend dry run mode to them as well.
    """

    def decorator(command):
        # All three options write to the same ``action`` parameter; --dry-run
        # is the default when none is given on the command line.
        @click.option(
            "--dry-run",
            "action",
            help="Only go through the motions of changing the database (default)",
            flag_value=DatabaseSessionAction("rollback"),
            type=DatabaseSessionAction,
            default=True,
        )
        @click.option(
            "--prompt",
            "action",
            help="Ask if changes to the database should be saved",
            flag_value=DatabaseSessionAction("prompt"),
            type=DatabaseSessionAction,
        )
        @click.option(
            "--commit",
            "action",
            help="Save changes to the database",
            flag_value=DatabaseSessionAction("commit"),
            type=DatabaseSessionAction,
        )
        @wraps(command)
        def decorated(*args, action, **kwargs):
            # Pull a session from the FastAPI dependency generator.
            # NOTE(review): the generator itself is never closed here, so any
            # cleanup in deps.get_db() (e.g. session close) only runs at GC —
            # confirm this is intended.
            db = next(deps.get_db())

            kwargs["db"] = db

            if pass_action:
                kwargs["action"] = action

            # None = command not yet run; set to True/False once it finishes
            # or raises, so the finally block can phrase its prompt/log text.
            processed_without_error = None

            try:
                command(*args, **kwargs)

            except Exception as error:
                processed_without_error = False

                logger.error(f"Aborting with error: {error}")
                # ``from None`` suppresses exception-context chaining in the
                # traceback shown to the user.
                raise error from None

            else:
                processed_without_error = True

            finally:
                # Decide whether to commit: interactively for --prompt,
                # unconditionally per the selected action otherwise. This runs
                # even when the command raised, allowing a partial commit of
                # successfully processed records.
                if action is DatabaseSessionAction.PROMPT:
                    ask_to_commit = (
                        "Commit all changes?"
                        if processed_without_error
                        else "Commit successfully processed records up to this point?"
                    )

                    commit = click.confirm(ask_to_commit)
                else:
                    commit = action is DatabaseSessionAction.COMMIT

                if commit:
                    logger.info(
                        "Committing all changes"
                        if processed_without_error
                        else "Committing successfully processed records up to this point"
                    )
                    db.commit()

                else:
                    logger.info("Rolling back all changes; the database will not be modified")
                    db.rollback()

        return decorated

    # Support both bare ``@with_database_session`` and parameterized
    # ``@with_database_session(pass_action=True)`` usage.
    return decorator(command) if command else decorator

src/mavedb/scripts/export_public_data.py

Lines changed: 68 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -34,17 +34,16 @@
3434

3535
from fastapi.encoders import jsonable_encoder
3636
from sqlalchemy import select
37-
from sqlalchemy.orm import lazyload
37+
from sqlalchemy.orm import lazyload, Session
3838

3939
from mavedb.lib.score_sets import get_score_set_counts_as_csv, get_score_set_scores_as_csv
40-
from mavedb.lib.script_environment import init_script_environment
4140
from mavedb.models.experiment import Experiment
4241
from mavedb.models.experiment_set import ExperimentSet
4342
from mavedb.models.license import License
4443
from mavedb.models.score_set import ScoreSet
4544
from mavedb.view_models.experiment_set import ExperimentSetPublicDump
4645

47-
db = init_script_environment()
46+
from mavedb.scripts.environment import script_environment, with_database_session
4847

4948
logger = logging.getLogger(__name__)
5049

@@ -89,68 +88,73 @@ def flatmap(f: Callable[[S], Iterable[T]], items: Iterable[S]) -> Iterable[T]:
8988
return chain.from_iterable(map(f, items))
9089

9190

92-
logger.info("Fetching data sets")
93-
94-
experiment_sets_query = db.scalars(
95-
select(ExperimentSet)
96-
.where(ExperimentSet.published_date.is_not(None))
97-
.options(
98-
lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options(
99-
lazyload(
100-
Experiment.score_sets.and_(
101-
ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0")
91+
@script_environment.command()
@with_database_session
def export_public_data(db: Session):
    """
    Export all public, CC0-licensed MaveDB data sets to a timestamped zip
    archive in the current directory.

    The archive contains a ``main.json`` with metadata for every included
    experiment set, a copy of the CC0 license, and per-score-set CSV files of
    scores (and counts, where present).

    ``db`` is supplied by the :py:func:`with_database_session` decorator.
    """
    # Select published experiment sets, lazily loading only their published
    # experiments and, within those, only published CC0-licensed score sets.
    experiment_sets_query = db.scalars(
        select(ExperimentSet)
        .where(ExperimentSet.published_date.is_not(None))
        .options(
            lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options(
                lazyload(
                    Experiment.score_sets.and_(
                        ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0")
                    )
                )
            )
        )
        .execution_options(populate_existing=True)
        .order_by(ExperimentSet.urn)
    )

    # Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score
    # sets.
    experiment_sets = list(filter_experiment_sets(experiment_sets_query.all()))

    # TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator.
    # Issue: https://github.com/VariantEffect/mavedb-api/issues/192
    # See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators.

    experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets))

    # Get a list of IDS of all the score sets included.
    score_set_ids = list(
        flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets)
    )

    # Archive name carries a local-time timestamp, e.g. mavedb-dump.20240101120000.zip.
    timestamp_format = "%Y%m%d%H%M%S"
    zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip"

    logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json")
    json_data = {
        "title": "MaveDB public data",
        "asOf": datetime.now(timezone.utc).isoformat(),
        "experimentSets": experiment_set_views,
    }

    with ZipFile(zip_file_name, "w") as zipfile:
        # Write metadata for all data sets to a single JSON file.
        zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data)))

        # Copy the CC0 license.
        zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt")

        # Write score and count files for each score set. Score sets are
        # re-fetched individually by id rather than held from the query above.
        num_score_sets = len(score_set_ids)
        for i, score_set_id in enumerate(score_set_ids):
            score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none()
            if score_set is not None and score_set.urn is not None:
                logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}")
                # URNs contain ':' which is awkward in file names; use '-'.
                csv_filename_base = score_set.urn.replace(":", "-")

                csv_str = get_score_set_scores_as_csv(db, score_set)
                zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str)

                # Counts are optional: only emitted when the score set declares
                # count columns in its dataset_columns metadata.
                count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None
                if count_columns and len(count_columns) > 0:
                    csv_str = get_score_set_counts_as_csv(db, score_set)
                    zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str)


if __name__ == "__main__":
    export_public_data()

0 commit comments

Comments (0)