Skip to content

Commit 3517c3a

Browse files
authored
feat: Enable configurable data location (#291)
closes #269
1 parent 55efb94 commit 3517c3a

File tree

10 files changed

+215
-23
lines changed

10 files changed

+215
-23
lines changed

notebooks/evidence_matching/fusion_evidence_matching.ipynb

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -224,12 +224,17 @@
224224
"# Generate list of matches, report match score\n",
225225
"from fusor.fusion_matching import FusionMatcher\n",
226226
"from fusor.harvester import CIVICHarvester\n",
227+
"from fusor.config import config\n",
228+
"from fusor.models import save_fusions_cache\n",
227229
"\n",
228230
"# Generate categorical fusions list\n",
229231
"harvester = CIVICHarvester(fusor=fusor)\n",
230232
"harvester.fusions_list = variants\n",
231233
"civic_fusions = await harvester.load_records()\n",
232234
"\n",
235+
"# Save cache for later\n",
236+
"save_fusions_cache(civic_fusions, cache_dir=config.data_root, cache_name=\"civic_translated_fusions.pkl\")\n",
237+
"\n",
233238
"# Initialize FusionMatcher and define sources to match against\n",
234239
"fm = FusionMatcher(assayed_fusions=assayed_fusion_star_fusion,\n",
235240
" categorical_fusions=civic_fusions)\n",
@@ -513,6 +518,44 @@
513518
"fusions_list = await harvester.load_records(path)"
514519
]
515520
},
521+
{
522+
"cell_type": "markdown",
523+
"id": "062b8820",
524+
"metadata": {},
525+
"source": [
526+
"#### Load and standardize data from Molecular Oncology Almanac (MOA)"
527+
]
528+
},
529+
{
530+
"cell_type": "code",
531+
"execution_count": 12,
532+
"id": "92b5485c",
533+
"metadata": {},
534+
"outputs": [
535+
{
536+
"name": "stdout",
537+
"output_type": "stream",
538+
"text": [
539+
"Downloading v.2025-06-12.zip...\n"
540+
]
541+
},
542+
{
543+
"name": "stderr",
544+
"output_type": "stream",
545+
"text": [
546+
"519kB [00:00, 4.56MB/s]\n"
547+
]
548+
}
549+
],
550+
"source": [
551+
"from fusor.harvester import MOAHarvester\n",
552+
"harvester = MOAHarvester(fusor=fusor, cache_dir=config.data_root)\n",
553+
"moa_fusions = harvester.load_records()\n",
554+
"\n",
555+
"# Save cache for later\n",
556+
"save_fusions_cache(moa_fusions, cache_dir=config.data_root, cache_name=\"moa_translated_fusions.pkl\")"
557+
]
558+
},
516559
{
517560
"cell_type": "markdown",
518561
"id": "6ed39fe1",
@@ -524,7 +567,7 @@
524567
},
525568
{
526569
"cell_type": "code",
527-
"execution_count": 12,
570+
"execution_count": 13,
528571
"id": "79afb22e",
529572
"metadata": {},
530573
"outputs": [
@@ -569,10 +612,9 @@
569612
"from fusor.fusion_matching import FusionMatcher\n",
570613
"\n",
571614
"# Initialize FusionMatcher and define sources to match against. This time, we will use\n",
572-
"# the cache_files field, using the cached pickle files provided in the src/fusor/data \n",
573-
"# repo\n",
615+
"# the cache_files field, using the cached pickle files that have been saved\n",
574616
"fm = FusionMatcher(assayed_fusions=fusions_list,\n",
575-
" cache_dir=Path(\"../../src/fusor/data\"),\n",
617+
" cache_dir=config.data_root,\n",
576618
" cache_files=[\"civic_translated_fusions.pkl\", \"moa_translated_fusions.pkl\"])\n",
577619
"\n",
578620
"# Generate list of matching fusions\n",
@@ -592,7 +634,7 @@
592634
},
593635
{
594636
"cell_type": "code",
595-
"execution_count": 13,
637+
"execution_count": 14,
596638
"id": "af5e10b1",
597639
"metadata": {},
598640
"outputs": [],

src/fusor/config.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Configure data storage variables for fusion objects"""
2+
3+
import os
4+
from pathlib import Path
5+
from typing import NamedTuple
6+
7+
from wags_tails.utils.storage import get_data_dir
8+
9+
10+
class _Config(NamedTuple):
11+
"""Define config data structure."""
12+
13+
data_root: Path
14+
15+
16+
def _get_configs() -> _Config:
17+
"""Fetch config values from environment.
18+
19+
Eventually this may be transformed into something using `pydantic-settings` but for
20+
now it just assembles a NamedTuple.
21+
22+
:return: constructed config object
23+
"""
24+
if env_var_data_dir := os.environ.get("FUSOR_DATA_DIR"):
25+
data_root_location = Path(env_var_data_dir)
26+
else:
27+
data_root_location = get_data_dir() / "fusor"
28+
return _Config(data_root=data_root_location)
29+
30+
31+
config = _get_configs()
-10.1 MB
Binary file not shown.
-401 KB
Binary file not shown.

src/fusor/fusion_matching.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pickle
44
from pathlib import Path
55

6+
from fusor.config import config
67
from fusor.models import (
78
AssayedFusion,
89
CategoricalFusion,
@@ -27,8 +28,8 @@ def __init__(
2728
"""Initialize FusionMatcher class and comparator categorical fusion objects
2829
2930
:param cache_dir: The directory containing the cached categorical fusions
30-
files. If cached files do not exist in the directory, a cached file at
31-
the provided location will be generated for each source.
31+
files. If this parameter is not provided, it will be set by default
32+
to be `FUSOR_DATA_DIR`.
3233
:param assayed_fusions: A list of AssayedFusion objects
3334
:param categorical_fusions: A list of CategoricalFusion objects
3435
:param cache_files: A list of cache file names in ``cache_dir`` containing
@@ -40,6 +41,9 @@ def __init__(
4041
if not categorical_fusions and (not cache_dir or not cache_files):
4142
msg = "Either a list of CategoricalFusion objects must be provided to `categorical_fusions` or a Path and list of file names must be provided to `cache_dir` and `cache_files`, respectively"
4243
raise ValueError(msg)
44+
if not cache_dir:
45+
cache_dir = config.data_root
46+
cache_dir.mkdir(parents=True, exist_ok=True)
4347
self.cache_dir = cache_dir
4448
self.assayed_fusions = assayed_fusions
4549
self.cache_files = cache_files

src/fusor/harvester.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from cool_seq_tool.schemas import Assembly, CoordinateType
1313
from wags_tails import MoaData
1414

15+
from fusor.config import config
1516
from fusor.fusion_caller_models import (
1617
CIVIC,
1718
JAFFA,
@@ -336,14 +337,14 @@ def __init__(
336337
:param fusor: A FUSOR object
337338
:param cache_dir: The path in which to store the cached MOA assertions.
338339
By default, this is set to None, and the MOA assertions are
339-
stored in src/fusor/data
340+
stored in the `FUSOR_DATA_DIR` directory.
340341
:param force_refresh: A boolean indicating if the MOA assertions
341342
file should be regenerated. By default, this is set to ``False``.
342343
"""
343344
self.translator = MOATranslator(fusor)
344345
if not cache_dir:
345-
cache_dir = Path(__file__).resolve().parent / "data"
346-
cache_dir.mkdir(parents=True, exist_ok=True)
346+
cache_dir = config.data_root
347+
cache_dir.mkdir(parents=True, exist_ok=True)
347348
moa_downloader = MoaData(data_dir=cache_dir)
348349
moa_file = moa_downloader.get_latest(force_refresh=force_refresh)[0]
349350
with moa_file.open("rb") as f:

src/fusor/models.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
model_validator,
2727
)
2828

29+
from fusor.config import config
30+
2931
_logger = logging.getLogger(__name__)
3032

3133
LINKER_REGEX = r"\|([atcg]+)\|"
@@ -953,19 +955,19 @@ class CategoricalFusion(AbstractFusion):
953955

954956
def save_fusions_cache(
955957
fusions_list: list[AssayedFusion | CategoricalFusion],
956-
cache_dir: Path,
957958
cache_name: str,
959+
cache_dir: Path | None = None,
958960
) -> None:
959961
"""Save a list of translated fusions as a cache
960962
961963
:param fusions_list: A list of FUSOR-translated fusions
962-
:param output_dir: The location to store the cached file. If this parameter is
963-
not supplied, it will default to creating a `data` directory under
964-
`src/fusor`
965964
:param cache_name: The name for the resultant cached file
965+
:param cache_dir: The location to store the cached file. If this parameter is
966+
not supplied, it will default to storing data in the `FUSOR_DATA_DIR`
967+
directory
966968
"""
967-
if not Path.is_dir(cache_dir):
968-
cache_dir = Path(__file__).resolve().parent / "data"
969+
if not cache_dir:
970+
cache_dir = config.data_root
969971
cache_dir.mkdir(parents=True, exist_ok=True)
970972
output_file = cache_dir / cache_name
971973
if output_file.exists():

tests/conftest.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,13 @@
66
import pytest
77
from cool_seq_tool.app import CoolSeqTool
88

9+
from fusor.config import config
910
from fusor.fusion_matching import FusionMatcher
1011
from fusor.fusor import FUSOR
1112

1213
FIXTURE_DATA_DIR = Path(__file__).parents[0].resolve() / "fixtures"
13-
CACHE_DATA_DIR = Path(__file__).resolve().parent.parent / "src" / "fusor" / "data"
14-
CACHE_DATA_DIR.mkdir(parents=True, exist_ok=True) # Create cache data directory
14+
CACHE_DATA_DIR = config.data_root
15+
CACHE_DATA_DIR.mkdir(parents=True, exist_ok=True)
1516

1617

1718
def pytest_addoption(parser):

tests/fusion_matching_test_cases.yaml

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,115 @@ tests:
189189
- gene:
190190
name: "ABL1"
191191
score: 2
192+
- fields:
193+
type: "CategoricalFusion"
194+
structure:
195+
- gene:
196+
name: "BCR"
197+
- gene:
198+
name: "ABL1"
199+
score: 2
200+
- fields:
201+
type: "CategoricalFusion"
202+
structure:
203+
- gene:
204+
name: "BCR"
205+
- gene:
206+
name: "ABL1"
207+
score: 2
208+
- fields:
209+
type: "CategoricalFusion"
210+
structure:
211+
- gene:
212+
name: "BCR"
213+
- gene:
214+
name: "ABL1"
215+
score: 2
216+
- fields:
217+
type: "CategoricalFusion"
218+
structure:
219+
- gene:
220+
name: "BCR"
221+
- gene:
222+
name: "ABL1"
223+
score: 2
224+
- fields:
225+
type: "CategoricalFusion"
226+
structure:
227+
- gene:
228+
name: "BCR"
229+
- gene:
230+
name: "ABL1"
231+
score: 2
232+
- fields:
233+
type: "CategoricalFusion"
234+
structure:
235+
- gene:
236+
name: "BCR"
237+
- gene:
238+
name: "ABL1"
239+
score: 2
240+
- fields:
241+
type: "CategoricalFusion"
242+
structure:
243+
- gene:
244+
name: "BCR"
245+
- gene:
246+
name: "ABL1"
247+
score: 2
248+
- fields:
249+
type: "CategoricalFusion"
250+
structure:
251+
- gene:
252+
name: "BCR"
253+
- gene:
254+
name: "ABL1"
255+
score: 2
256+
- fields:
257+
type: "CategoricalFusion"
258+
structure:
259+
- gene:
260+
name: "BCR"
261+
- gene:
262+
name: "ABL1"
263+
score: 2
264+
- fields:
265+
type: "CategoricalFusion"
266+
structure:
267+
- gene:
268+
name: "BCR"
269+
- gene:
270+
name: "ABL1"
271+
score: 2
272+
- fields:
273+
type: "CategoricalFusion"
274+
structure:
275+
- gene:
276+
name: "BCR"
277+
- gene:
278+
name: "ABL1"
279+
score: 2
280+
- fields:
281+
type: "CategoricalFusion"
282+
structure:
283+
- gene:
284+
name: "BCR"
285+
- gene:
286+
name: "ABL1"
287+
score: 2
288+
- fields:
289+
type: "CategoricalFusion"
290+
structure:
291+
- gene:
292+
name: "BCR"
293+
- gene:
294+
name: "ABL1"
295+
score: 2
296+
- fields:
297+
type: "CategoricalFusion"
298+
structure:
299+
- gene:
300+
name: "BCR"
301+
- gene:
302+
name: "ABL1"
303+
score: 2

tests/test_models.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from cool_seq_tool.schemas import Strand
77
from pydantic import ValidationError
88

9+
from fusor.config import config
910
from fusor.models import (
1011
AbstractFusion,
1112
AnchoredReads,
@@ -1067,7 +1068,7 @@ def test_model_examples():
10671068
model(**schema["example"])
10681069

10691070

1070-
def test_save_cache(fixture_data_dir):
1071+
def test_save_cache():
10711072
"""Test cache saving functionality for AssayedFusion and CategoricalFusion
10721073
objects
10731074
"""
@@ -1081,15 +1082,13 @@ def test_save_cache(fixture_data_dir):
10811082
# Test AssayedFusion
10821083
save_fusions_cache(
10831084
fusions_list=[assayed_fusion],
1084-
cache_dir=Path(fixture_data_dir),
10851085
cache_name="assayed_cache_test.pkl",
10861086
)
1087-
assert Path.exists(fixture_data_dir / "assayed_cache_test.pkl")
1087+
assert Path.exists(config.data_root / "assayed_cache_test.pkl")
10881088

10891089
# Test CategoricalFusion
10901090
save_fusions_cache(
10911091
fusions_list=[categorical_fusion],
1092-
cache_dir=Path(fixture_data_dir),
10931092
cache_name="categorical_cache_test.pkl",
10941093
)
1095-
assert Path.exists(fixture_data_dir / "categorical_cache_test.pkl")
1094+
assert Path.exists(config.data_root / "categorical_cache_test.pkl")

0 commit comments

Comments
 (0)