Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions docs/tutorials/3_train.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.11/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
"/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
Expand All @@ -94,10 +94,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33manony-mouse-945272810042178709\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33manony-moose-627961369310828315\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact fragment_file:latest, 2203.42MB. 1 files... \n",
"\u001b[34m\u001b[1mwandb\u001b[0m: 1 of 1 files downloaded. \n",
"Done. 0:0:36.8\n",
"Done. 0:0:4.9 (447.4MB/s)\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: 1 of 1 files downloaded. \n"
]
}
Expand Down Expand Up @@ -445,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "0ad3c925-162b-4bca-a8e6-6a9b99fe8a15",
"metadata": {
"scrolled": true
Expand Down Expand Up @@ -516,7 +516,7 @@
"20216 chr1 858284 860398"
]
},
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -527,7 +527,6 @@
" binwidth=0.02, # resolution of measuring GC content\n",
" genome=genome,\n",
" chroms=\"autosomes\", # negative regions will also be chosen from autosomes\n",
" blacklist=genome, # negative regions overlapping the blacklist will be dropped\n",
" seed=0,\n",
")\n",
"negatives.head(3)"
Expand Down Expand Up @@ -1301,7 +1300,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down
19 changes: 14 additions & 5 deletions src/grelu/data/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,7 +543,8 @@ def get_gc_matched_intervals(
genome: Name of the genome corresponding to intervals
binwidth: Resolution of GC content
chroms: Chromosomes to search for matched intervals
blacklist: Blacklist file of regions to exclude
blacklist: Blacklist file of regions to exclude. If None, the
genome name will be used to find the appropriate blacklist file.
seed: Random seed

Returns:
Expand All @@ -554,7 +555,7 @@ def get_gc_matched_intervals(
from grelu.io.genome import get_genome
from grelu.sequence.utils import get_unique_length

genome = get_genome(genome)
genome_obj = get_genome(genome)
chroms = get_chromosomes(chroms)

# Get seq_len
Expand All @@ -563,7 +564,7 @@ def get_gc_matched_intervals(
print("Extracting matching intervals")
matched_loci = extract_matching_loci(
intervals,
fasta=genome.genome_file,
fasta=genome_obj.genome_file,
in_window=seq_len,
gc_bin_width=binwidth,
chroms=chroms,
Expand All @@ -572,9 +573,17 @@ def get_gc_matched_intervals(
)

print("Filtering blacklist")
if blacklist is not None:
if blacklist is None:
try:
matched_loci = filter_blacklist(
data=matched_loci, genome=genome
)
except:
print(f"Failed to load a blacklist file for genome {genome}.")
print("Skipping blacklist filtering.")
else:
matched_loci = filter_blacklist(
data=matched_loci, genome=genome, blacklist=blacklist
data=matched_loci, blacklist=blacklist
)
return matched_loci

Expand Down
15 changes: 15 additions & 0 deletions tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
filter_overlapping,
merge_intervals_by_column,
split,
get_gc_matched_intervals
)


Expand Down Expand Up @@ -213,3 +214,17 @@ def test_merge_intervals_by_column():
}
)
merge_intervals_by_column(intervals, group_col="gene")


def test_get_gc_matched_intervals():
    """Check that one input interval yields exactly one GC-matched interval.

    A single 10 bp interval on chr10 is used as the query and matches are
    searched for on chr21 of hg38. No ``blacklist`` argument is passed, so
    the function's default blacklist handling is what gets exercised.
    """
    query_start = int(1e7)
    query = pd.DataFrame(
        {
            "chrom": ["chr10"],
            "start": [query_start],
            "end": [query_start + 10],
        }
    )

    matched = get_gc_matched_intervals(
        intervals=query,
        genome="hg38",
        chroms=["chr21"],
    )

    # One query interval in, one matched interval out.
    assert len(matched) == 1