Skip to content

Commit 1c9f463

Browse files
authored
Merge pull request #185 from jkmckenna/0.2.5
0.2.5
2 parents ede41b7 + 0c77a49 commit 1c9f463

34 files changed

+909
-499
lines changed

docs/source/api/datasets.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,19 @@
66

77
```{eval-rst}
88
.. currentmodule:: smftools
9-
```
9+
```
10+
11+
```{eval-rst}
12+
.. autosummary::
13+
:toctree: generated/datasets
14+
:recursive:
15+
16+
smftools.datasets
17+
```
18+
19+
```{eval-rst}
20+
.. automodule:: smftools.datasets
21+
:members:
22+
:undoc-members:
23+
:show-inheritance:
24+
```

docs/source/api/informatics.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,21 @@
1515

1616
Processes raw sequencing data to load an adata object.
1717

18+
```{eval-rst}
19+
.. autosummary::
20+
:toctree: generated/informatics
21+
:recursive:
22+
23+
smftools.informatics
24+
```
25+
26+
```{eval-rst}
27+
.. automodule:: smftools.informatics
28+
:members:
29+
:undoc-members:
30+
:show-inheritance:
31+
```
32+
1833

1934
### Diagram of final steps of Direct SMF workflow
2035
```{image} ../_static/modkit_extract_to_adata.png

docs/source/api/preprocessing.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,18 @@
1212
```{eval-rst}
1313
.. currentmodule:: smftools
1414
```
15+
16+
```{eval-rst}
17+
.. autosummary::
18+
:toctree: generated/preprocessing
19+
:recursive:
20+
21+
smftools.preprocessing
22+
```
23+
24+
```{eval-rst}
25+
.. automodule:: smftools.preprocessing
26+
:members:
27+
:undoc-members:
28+
:show-inheritance:
29+
```

docs/source/api/tools.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,19 @@
66

77
```{eval-rst}
88
.. currentmodule:: smftools
9-
```
9+
```
10+
11+
```{eval-rst}
12+
.. autosummary::
13+
:toctree: generated/tools
14+
:recursive:
15+
16+
smftools.tools
17+
```
18+
19+
```{eval-rst}
20+
.. automodule:: smftools.tools
21+
:members:
22+
:undoc-members:
23+
:show-inheritance:
24+
```

src/smftools/datasets/datasets.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,42 @@
1-
## datasets
1+
"""Dataset helpers for bundled SMF datasets."""
22

3+
from __future__ import annotations
34

4-
def import_HERE():
5-
"""
6-
Imports HERE for loading datasets
7-
"""
8-
from pathlib import Path
5+
from pathlib import Path
6+
from typing import TYPE_CHECKING
97

10-
HERE = Path(__file__).parent
11-
return HERE
8+
if TYPE_CHECKING:
9+
import anndata as ad
1210

1311

14-
def dCas9_kinetics():
12+
def import_HERE() -> Path:
13+
"""Resolve the local dataset directory.
14+
15+
Returns:
16+
Path: Path to the datasets directory.
1517
"""
16-
in vitro Hia5 dCas9 kinetics SMF dataset. Nanopore HAC m6A modcalls.
18+
return Path(__file__).parent
19+
20+
21+
def dCas9_kinetics() -> "ad.AnnData":
22+
"""Load the in vitro Hia5 dCas9 kinetics SMF dataset.
23+
24+
Returns:
25+
anndata.AnnData: Annotated dataset with Nanopore HAC m6A modcalls.
1726
"""
1827
import anndata as ad
1928

20-
HERE = import_HERE()
21-
filepath = HERE / "dCas9_m6A_invitro_kinetics.h5ad.gz"
29+
filepath = import_HERE() / "dCas9_m6A_invitro_kinetics.h5ad.gz"
2230
return ad.read_h5ad(filepath)
2331

2432

25-
def Kissiov_and_McKenna_2025():
26-
"""
27-
F1 Hybrid M.CviPI natural killer cell SMF. Nanopore canonical calls of NEB EMseq converted SMF gDNA.
33+
def Kissiov_and_McKenna_2025() -> "ad.AnnData":
34+
"""Load the F1 Hybrid M.CviPI natural killer cell SMF dataset.
35+
36+
Returns:
37+
anndata.AnnData: Annotated dataset with canonical calls of NEB EMseq converted SMF gDNA.
2838
"""
2939
import anndata as ad
3040

31-
HERE = import_HERE()
32-
filepath = HERE / "F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz"
41+
filepath = import_HERE() / "F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz"
3342
return ad.read_h5ad(filepath)

src/smftools/informatics/converted_BAM_to_adata.py

Lines changed: 31 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,38 +27,39 @@
2727

2828

2929
def converted_BAM_to_adata(
30-
converted_FASTA,
31-
split_dir,
32-
output_dir,
33-
input_already_demuxed,
34-
mapping_threshold,
35-
experiment_name,
36-
conversions,
37-
bam_suffix,
38-
device="cpu",
39-
num_threads=8,
40-
deaminase_footprinting=False,
41-
delete_intermediates=True,
42-
double_barcoded_path=None,
43-
):
44-
"""
45-
Converts BAM files into an AnnData object by binarizing modified base identities.
46-
47-
Parameters:
48-
converted_FASTA (Path): Path to the converted FASTA reference.
49-
split_dir (Path): Directory containing converted BAM files.
50-
output_dir (Path): Directory of the output dir
51-
input_already_demuxed (bool): Whether input reads were originally demuxed
52-
mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
53-
experiment_name (str): Name for the output AnnData object.
54-
conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
55-
bam_suffix (str): File suffix for BAM files.
56-
num_threads (int): Number of parallel processing threads.
57-
deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
58-
double_barcoded_path (Path): Path to dorado demux summary file of double ended barcodes
30+
converted_FASTA: str | Path,
31+
split_dir: Path,
32+
output_dir: Path,
33+
input_already_demuxed: bool,
34+
mapping_threshold: float,
35+
experiment_name: str,
36+
conversions: list[str],
37+
bam_suffix: str,
38+
device: str | torch.device = "cpu",
39+
num_threads: int = 8,
40+
deaminase_footprinting: bool = False,
41+
delete_intermediates: bool = True,
42+
double_barcoded_path: Path | None = None,
43+
) -> tuple[ad.AnnData | None, Path]:
44+
"""Convert BAM files into an AnnData object by binarizing modified base identities.
45+
46+
Args:
47+
converted_FASTA: Path to the converted FASTA reference.
48+
split_dir: Directory containing converted BAM files.
49+
output_dir: Output directory for intermediate and final files.
50+
input_already_demuxed: Whether input reads were originally demultiplexed.
51+
mapping_threshold: Minimum fraction of aligned reads required for inclusion.
52+
experiment_name: Name for the output AnnData object.
53+
conversions: List of modification types (e.g., ``["unconverted", "5mC", "6mA"]``).
54+
bam_suffix: File suffix for BAM files.
55+
device: Torch device or device string.
56+
num_threads: Number of parallel processing threads.
57+
deaminase_footprinting: Whether the footprinting used direct deamination chemistry.
58+
delete_intermediates: Whether to remove intermediate files after processing.
59+
double_barcoded_path: Path to dorado demux summary file of double-ended barcodes.
5960
6061
Returns:
61-
str: Path to the final AnnData object.
62+
tuple[anndata.AnnData | None, Path]: The AnnData object (if generated) and its path.
6263
"""
6364
if torch.cuda.is_available():
6465
device = torch.device("cuda")

0 commit comments

Comments
 (0)