From 79531ef6fc734871c974fc3a452a4699da4ac820 Mon Sep 17 00:00:00 2001 From: avantikalal Date: Wed, 6 Aug 2025 21:14:42 +0000 Subject: [PATCH 1/2] fixed issue with gc matching --- src/grelu/data/preprocess.py | 19 ++++++++++++++----- tests/test_preprocess.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/grelu/data/preprocess.py b/src/grelu/data/preprocess.py index d1406515..a316ad7f 100644 --- a/src/grelu/data/preprocess.py +++ b/src/grelu/data/preprocess.py @@ -543,7 +543,8 @@ def get_gc_matched_intervals( genome: Name of the genome corresponding to intervals binwidth: Resolution of GC content chroms: Chromosomes to search for matched intervals - blacklist: Blacklist file of regions to exclude + blacklist: Blacklist file of regions to exclude. If None, the + genome name will be used to find the appropriate blacklist file. seed: Random seed Returns: @@ -554,7 +555,7 @@ def get_gc_matched_intervals( from grelu.io.genome import get_genome from grelu.sequence.utils import get_unique_length - genome = get_genome(genome) + genome_obj = get_genome(genome) chroms = get_chromosomes(chroms) # Get seq_len @@ -563,7 +564,7 @@ def get_gc_matched_intervals( print("Extracting matching intervals") matched_loci = extract_matching_loci( intervals, - fasta=genome.genome_file, + fasta=genome_obj.genome_file, in_window=seq_len, gc_bin_width=binwidth, chroms=chroms, @@ -572,9 +573,17 @@ def get_gc_matched_intervals( ) print("Filtering blacklist") - if blacklist is not None: + if blacklist is None: + try: + matched_loci = filter_blacklist( + data=matched_loci, genome=genome + ) + except: + print(f"Failed to load a blacklist file for genome {genome}.") + print("Skipping blacklist filtering.") + else: matched_loci = filter_blacklist( - data=matched_loci, genome=genome, blacklist=blacklist + data=matched_loci, blacklist=blacklist ) return matched_loci diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py index 734e5d34..48624bd5 100644 --- a/tests/test_preprocess.py +++ b/tests/test_preprocess.py @@ -12,6 +12,7 @@ filter_overlapping, merge_intervals_by_column, split, + get_gc_matched_intervals ) @@ -213,3 +214,17 @@ def test_merge_intervals_by_column(): } ) merge_intervals_by_column(intervals, group_col="gene") + + +def test_get_gc_matched_intervals(): + intervals = pd.DataFrame( + { + "chrom": ["chr10"], + "start": [int(1e7)], + "end": [int(1e7+10)], + } + ) + + res = get_gc_matched_intervals( + intervals=intervals, genome='hg38', chroms=['chr21']) + assert len(res) == 1 From 343a222d8fe0b1fb47527bf945971127a56ae266 Mon Sep 17 00:00:00 2001 From: avantikalal Date: Wed, 6 Aug 2025 21:56:49 +0000 Subject: [PATCH 2/2] reran tutorial 3 --- docs/tutorials/3_train.ipynb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/tutorials/3_train.ipynb b/docs/tutorials/3_train.ipynb index 1bdbacbc..8fe08328 100644 --- a/docs/tutorials/3_train.ipynb +++ b/docs/tutorials/3_train.ipynb @@ -75,7 +75,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python3.11/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } @@ -94,10 +94,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33manony-mouse-945272810042178709\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", + "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33manony-moose-627961369310828315\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n", "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact fragment_file:latest, 2203.42MB. 1 files... \n", "\u001b[34m\u001b[1mwandb\u001b[0m: 1 of 1 files downloaded. \n", - "Done. 0:0:36.8\n", + "Done. 0:0:4.9 (447.4MB/s)\n", "\u001b[34m\u001b[1mwandb\u001b[0m: 1 of 1 files downloaded. \n" ] } @@ -445,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "0ad3c925-162b-4bca-a8e6-6a9b99fe8a15", "metadata": { "scrolled": true @@ -516,7 +516,7 @@ "20216 chr1 858284 860398" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -527,7 +527,6 @@ " binwidth=0.02, # resolution of measuring GC content\n", " genome=genome,\n", " chroms=\"autosomes\", # negative regions will also be chosen from autosomes\n", - " blacklist=genome, # negative regions overlapping the blacklist will be dropped\n", " seed=0,\n", ")\n", "negatives.head(3)" @@ -1301,7 +1300,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4,