34 | 34 |
35 | 35 | from fastapi.encoders import jsonable_encoder |
36 | 36 | from sqlalchemy import select |
37 | | -from sqlalchemy.orm import lazyload |
| 37 | +from sqlalchemy.orm import lazyload, Session |
38 | 38 |
39 | 39 | from mavedb.lib.score_sets import get_score_set_counts_as_csv, get_score_set_scores_as_csv |
40 | | -from mavedb.lib.script_environment import init_script_environment |
41 | 40 | from mavedb.models.experiment import Experiment |
42 | 41 | from mavedb.models.experiment_set import ExperimentSet |
43 | 42 | from mavedb.models.license import License |
44 | 43 | from mavedb.models.score_set import ScoreSet |
45 | 44 | from mavedb.view_models.experiment_set import ExperimentSetPublicDump |
46 | 45 |
47 | | -db = init_script_environment() |
| 46 | +from mavedb.scripts.environment import script_environment, with_database_session |
48 | 47 |
49 | 48 | logger = logging.getLogger(__name__) |
50 | 49 |
@@ -89,68 +88,73 @@ def flatmap(f: Callable[[S], Iterable[T]], items: Iterable[S]) -> Iterable[T]: |
89 | 88 | return chain.from_iterable(map(f, items)) |
90 | 89 |
91 | 90 |
92 | | -logger.info("Fetching data sets") |
93 | | - |
94 | | -experiment_sets_query = db.scalars( |
95 | | - select(ExperimentSet) |
96 | | - .where(ExperimentSet.published_date.is_not(None)) |
97 | | - .options( |
98 | | - lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options( |
99 | | - lazyload( |
100 | | - Experiment.score_sets.and_( |
101 | | - ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0") |
| 91 | +@script_environment.command() |
| 92 | +@with_database_session |
| 93 | +def export_public_data(db: Session): |
| 94 | + experiment_sets_query = db.scalars( |
| 95 | + select(ExperimentSet) |
| 96 | + .where(ExperimentSet.published_date.is_not(None)) |
| 97 | + .options( |
| 98 | + lazyload(ExperimentSet.experiments.and_(Experiment.published_date.is_not(None))).options( |
| 99 | + lazyload( |
| 100 | + Experiment.score_sets.and_( |
| 101 | + ScoreSet.published_date.is_not(None), ScoreSet.license.has(License.short_name == "CC0") |
| 102 | + ) |
102 | 103 | ) |
103 | 104 | ) |
104 | 105 | ) |
| 106 | + .execution_options(populate_existing=True) |
| 107 | + .order_by(ExperimentSet.urn) |
| 108 | + ) |
| 109 | + |
| 110 | + # Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score |
| 111 | + # sets. |
| 112 | + experiment_sets = list(filter_experiment_sets(experiment_sets_query.all())) |
| 113 | + |
| 114 | + # TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator. |
| 115 | + # Issue: https://github.com/VariantEffect/mavedb-api/issues/192 |
| 116 | + # See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators. |
| 117 | + |
| 118 | + experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets)) |
| 119 | + |
| 120 | +    # Get a list of IDs of all the score sets included. |
| 121 | + score_set_ids = list( |
| 122 | + flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets) |
105 | 123 | ) |
106 | | - .execution_options(populate_existing=True) |
107 | | - .order_by(ExperimentSet.urn) |
108 | | -) |
109 | | - |
110 | | -# Filter the stream of experiment sets to exclude experiments and experiment sets with no public, CC0-licensed score |
111 | | -# sets. |
112 | | -experiment_sets = list(filter_experiment_sets(experiment_sets_query.all())) |
113 | | - |
114 | | -# TODO To support very large data sets, we may want to use custom code for JSON-encoding an iterator. |
115 | | -# Issue: https://github.com/VariantEffect/mavedb-api/issues/192 |
116 | | -# See, for instance, https://stackoverflow.com/questions/12670395/json-encoding-very-long-iterators. |
117 | | - |
118 | | -experiment_set_views = list(map(lambda es: ExperimentSetPublicDump.from_orm(es), experiment_sets)) |
119 | | - |
120 | | -# Get a list of IDs of all the score sets included. |
121 | | -score_set_ids = list( |
122 | | - flatmap(lambda es: flatmap(lambda e: map(lambda ss: ss.id, e.score_sets), es.experiments), experiment_sets) |
123 | | -) |
124 | | - |
125 | | -timestamp_format = "%Y%m%d%H%M%S" |
126 | | -zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip" |
127 | | - |
128 | | -logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json") |
129 | | -json_data = { |
130 | | - "title": "MaveDB public data", |
131 | | - "asOf": datetime.now(timezone.utc).isoformat(), |
132 | | - "experimentSets": experiment_set_views, |
133 | | -} |
134 | | - |
135 | | -with ZipFile(zip_file_name, "w") as zipfile: |
136 | | - # Write metadata for all data sets to a single JSON file. |
137 | | - zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data))) |
138 | | - |
139 | | - # Copy the CC0 license. |
140 | | - zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt") |
141 | | - |
142 | | - # Write score and count files for each score set. |
143 | | - num_score_sets = len(score_set_ids) |
144 | | - for i, score_set_id in enumerate(score_set_ids): |
145 | | - score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none() |
146 | | - if score_set is not None and score_set.urn is not None: |
147 | | - logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}") |
148 | | - csv_filename_base = score_set.urn.replace(":", "-") |
149 | | - |
150 | | - csv_str = get_score_set_scores_as_csv(db, score_set) |
151 | | - zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str) |
152 | | - |
153 | | - count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None |
154 | | - if count_columns and len(count_columns) > 0: |
155 | | - csv_str = get_score_set_counts_as_csv(db, score_set) |
156 | | - zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str) |
| 124 | + |
| 125 | + timestamp_format = "%Y%m%d%H%M%S" |
| 126 | + zip_file_name = f"mavedb-dump.{datetime.now().strftime(timestamp_format)}.zip" |
| 127 | + |
| 128 | + logger.info(f"Exporting public data set metadata to {zip_file_name}/main.json") |
| 129 | + json_data = { |
| 130 | + "title": "MaveDB public data", |
| 131 | + "asOf": datetime.now(timezone.utc).isoformat(), |
| 132 | + "experimentSets": experiment_set_views, |
| 133 | + } |
| 134 | + |
| 135 | + with ZipFile(zip_file_name, "w") as zipfile: |
| 136 | + # Write metadata for all data sets to a single JSON file. |
| 137 | + zipfile.writestr("main.json", json.dumps(jsonable_encoder(json_data))) |
| 138 | + |
| 139 | + # Copy the CC0 license. |
| 140 | + zipfile.write(os.path.join(os.path.dirname(__file__), "resources/CC0_license.txt"), "LICENSE.txt") |
| 141 | + |
| 142 | + # Write score and count files for each score set. |
| 143 | + num_score_sets = len(score_set_ids) |
| 144 | + for i, score_set_id in enumerate(score_set_ids): |
| 145 | + score_set = db.scalars(select(ScoreSet).where(ScoreSet.id == score_set_id)).one_or_none() |
| 146 | + if score_set is not None and score_set.urn is not None: |
| 147 | + logger.info(f"{i + 1}/{num_score_sets} Exporting variants for score set {score_set.urn}") |
| 148 | + csv_filename_base = score_set.urn.replace(":", "-") |
| 149 | + |
| 150 | + csv_str = get_score_set_scores_as_csv(db, score_set) |
| 151 | + zipfile.writestr(f"csv/{csv_filename_base}.scores.csv", csv_str) |
| 152 | + |
| 153 | + count_columns = score_set.dataset_columns["count_columns"] if score_set.dataset_columns else None |
| 154 | + if count_columns and len(count_columns) > 0: |
| 155 | + csv_str = get_score_set_counts_as_csv(db, score_set) |
| 156 | + zipfile.writestr(f"csv/{csv_filename_base}.counts.csv", csv_str) |
| 157 | + |
| 158 | + |
| 159 | +if __name__ == "__main__": |
| 160 | + export_public_data() |
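
Note on the new entry point: the module-level db = init_script_environment() call is gone; export_public_data is now registered as a command on script_environment, and with_database_session supplies the SQLAlchemy Session as the db parameter. A minimal sketch of the decorator pair assumed here (illustrative only, not mavedb's actual implementation; SessionLocal is a hypothetical session factory):

import functools

import click
from sqlalchemy.orm import Session, sessionmaker

# Assumption: a click group that collects mavedb maintenance commands.
script_environment = click.Group()

# Assumption: a session factory bound to the MaveDB engine elsewhere.
SessionLocal = sessionmaker()


def with_database_session(f):
    # Open a session, hand it to the command as `db`, commit on success,
    # roll back on error, and always close it.
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        db: Session = SessionLocal()
        try:
            result = f(*args, db=db, **kwargs)
            db.commit()
            return result
        except Exception:
            db.rollback()
            raise
        finally:
            db.close()

    return wrapper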
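
On the TODO about very large data sets (issue 192): the approach referenced in the linked Stack Overflow thread is to emit the JSON array incrementally instead of materializing experiment_set_views and one large string. A rough sketch of that idea (hypothetical, not part of this change; iter_json_array is an illustrative helper):

import json
from typing import Any, Iterable, Iterator


def iter_json_array(items: Iterable[Any]) -> Iterator[str]:
    # Yield a JSON array one chunk at a time instead of building it in memory.
    yield "["
    first = True
    for item in items:
        if not first:
            yield ","
        yield json.dumps(item)
        first = False
    yield "]"


# Possible use inside the ZipFile block, streaming main.json into the archive
# (ZipFile.open(..., mode="w") is available since Python 3.6):
# with zipfile.open("main.json", mode="w") as fh:
#     for chunk in iter_json_array(jsonable_encoder(view) for view in experiment_set_views):
#         fh.write(chunk.encode("utf-8"))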