From 79531ef6fc734871c974fc3a452a4699da4ac820 Mon Sep 17 00:00:00 2001
From: avantikalal <avantikalal1990@gmail.com>
Date: Wed, 6 Aug 2025 21:14:42 +0000
Subject: [PATCH 1/2] fixed blacklist handling in GC-matched interval sampling

---
 src/grelu/data/preprocess.py | 19 ++++++++++++++-----
 tests/test_preprocess.py     | 15 +++++++++++++++
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/grelu/data/preprocess.py b/src/grelu/data/preprocess.py
index d1406515..a316ad7f 100644
--- a/src/grelu/data/preprocess.py
+++ b/src/grelu/data/preprocess.py
@@ -543,7 +543,8 @@ def get_gc_matched_intervals(
         genome: Name of the genome corresponding to intervals
         binwidth: Resolution of GC content
         chroms: Chromosomes to search for matched intervals
-        blacklist: Blacklist file of regions to exclude
+        blacklist: Blacklist file of regions to exclude. If None, the blacklist
+            for `genome` is used; filtering is skipped if it cannot be loaded.
         seed: Random seed
 
     Returns:
@@ -554,7 +555,7 @@ def get_gc_matched_intervals(
     from grelu.io.genome import get_genome
     from grelu.sequence.utils import get_unique_length
 
-    genome = get_genome(genome)
+    genome_obj = get_genome(genome)
     chroms = get_chromosomes(chroms)
 
     # Get seq_len
@@ -563,7 +564,7 @@ def get_gc_matched_intervals(
     print("Extracting matching intervals")
     matched_loci = extract_matching_loci(
         intervals,
-        fasta=genome.genome_file,
+        fasta=genome_obj.genome_file,
         in_window=seq_len,
         gc_bin_width=binwidth,
         chroms=chroms,
@@ -572,9 +573,17 @@ def get_gc_matched_intervals(
     )
 
     print("Filtering blacklist")
-    if blacklist is not None:
+    if blacklist is None:  # no explicit file: look one up by genome name
+        try:
+            matched_loci = filter_blacklist(
+                data=matched_loci, genome=genome
+            )
+        except Exception:  # best-effort lookup; let Ctrl-C/SystemExit through
+            print(f"Failed to load a blacklist file for genome {genome}.")
+            print("Skipping blacklist filtering.")
+    else:  # an explicit blacklist was supplied; filter against it directly
         matched_loci = filter_blacklist(
-            data=matched_loci, genome=genome, blacklist=blacklist
+            data=matched_loci, blacklist=blacklist
         )
     return matched_loci
 
diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
index 734e5d34..48624bd5 100644
--- a/tests/test_preprocess.py
+++ b/tests/test_preprocess.py
@@ -12,6 +12,7 @@
     filter_overlapping,
     merge_intervals_by_column,
     split,
+    get_gc_matched_intervals
 )
 
 
@@ -213,3 +214,17 @@ def test_merge_intervals_by_column():
             }
         )
         merge_intervals_by_column(intervals, group_col="gene")
+
+
+def test_get_gc_matched_intervals():
+    # One 10-bp interval on chr10; matched intervals are searched on chr21.
+    start = int(1e7)
+    intervals = pd.DataFrame(
+        {"chrom": ["chr10"], "start": [start], "end": [start + 10]}
+    )
+
+    # No blacklist argument is passed, so the blacklist=None path is used.
+    matched = get_gc_matched_intervals(
+        intervals=intervals, genome="hg38", chroms=["chr21"]
+    )
+    assert len(matched) == 1

From 343a222d8fe0b1fb47527bf945971127a56ae266 Mon Sep 17 00:00:00 2001
From: avantikalal <avantikalal1990@gmail.com>
Date: Wed, 6 Aug 2025 21:56:49 +0000
Subject: [PATCH 2/2] reran tutorial 3

---
 docs/tutorials/3_train.ipynb | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/docs/tutorials/3_train.ipynb b/docs/tutorials/3_train.ipynb
index 1bdbacbc..8fe08328 100644
--- a/docs/tutorials/3_train.ipynb
+++ b/docs/tutorials/3_train.ipynb
@@ -75,7 +75,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/usr/local/lib/python3.11/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
       "  from .autonotebook import tqdm as notebook_tqdm\n"
      ]
     }
@@ -94,10 +94,10 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33manony-mouse-945272810042178709\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
+      "\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33manony-moose-627961369310828315\u001b[0m to \u001b[32mhttps://api.wandb.ai\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
       "\u001b[34m\u001b[1mwandb\u001b[0m: Downloading large artifact fragment_file:latest, 2203.42MB. 1 files... \n",
       "\u001b[34m\u001b[1mwandb\u001b[0m:   1 of 1 files downloaded.  \n",
-      "Done. 0:0:36.8\n",
+      "Done. 0:0:4.9 (447.4MB/s)\n",
       "\u001b[34m\u001b[1mwandb\u001b[0m:   1 of 1 files downloaded.  \n"
      ]
     }
@@ -445,7 +445,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "0ad3c925-162b-4bca-a8e6-6a9b99fe8a15",
    "metadata": {
     "scrolled": true
@@ -516,7 +516,7 @@
        "20216  chr1  858284  860398"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -527,7 +527,6 @@
     "    binwidth=0.02, # resolution of measuring GC content\n",
     "    genome=genome,\n",
     "    chroms=\"autosomes\", # negative regions will also be chosen from autosomes\n",
-    "    blacklist=genome, # negative regions overlapping the blacklist will be dropped\n",
     "    seed=0,\n",
     ")\n",
     "negatives.head(3)"
@@ -1301,7 +1300,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,