Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion pycisTopic/pseudobulk_peak_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def export_pseudobulk(
bed_path: str,
bigwig_path: str,
path_to_fragments: Optional[Dict[str, str]] = None,
chrom_filter: Optional[str] = None,
sample_id_col: Optional[str] = "sample_id",
n_cpu: Optional[int] = 1,
normalize_bigwig: Optional[bool] = True,
Expand Down Expand Up @@ -55,6 +56,9 @@ def export_pseudobulk(
A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to
be created. If a :class:`CistopicObject` is provided as input it will be ignored, but if a cell metadata :class:`pd.DataFrame` is provided it
is necessary to provide it. The keys of the dictionary need to match with the sample_id tag added to the index names of the input data frame.
chrom_filter: str, optional
A regular expression used to drop fragments mapped to unwanted contigs, such as unplaced/unlocalized scaffolds whose names contain "GL" or "KI" in GRCh38. Fragments whose chromosome name matches the pattern are removed.
Example: `"GL|KI"`
sample_id_col: str, optional
Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'.
n_cpu: int, optional
Expand Down Expand Up @@ -129,6 +133,12 @@ def export_pseudobulk(
prepare_tag_cells(cell_data.index.tolist(), split_pattern)
)
]
if chrom_filter is not None:
fragment_drop = fragments_df.Chromosome.str.contains(chrom_filter)
n_fragments_dropped = fragment_drop.sum()
log.info("Filtering out " + str(n_fragments_dropped) + " fragments.")
fragments_df.drop(fragments_df[fragment_drop].index, inplace=True)

fragments_df_dict[sample_id] = fragments_df

# Set groups
Expand Down Expand Up @@ -271,7 +281,7 @@ def export_pseudobulk_one_sample(
group_fragments_dict[list(group_fragments_dict.keys())[x]]
for x in range(len(fragments_df_dict))
]
group_fragments = group_fragments_list[0].append(group_fragments_list[1:])
group_fragments = pd.concat(group_fragments_list)

del group_fragments_dict
del group_fragments_list
Expand All @@ -280,6 +290,7 @@ def export_pseudobulk_one_sample(

group_pr = pr.PyRanges(group_fragments)
if isinstance(bigwig_path, str):
log.info("Creating bigwig file for " + str(group))
bigwig_path_group = os.path.join(bigwig_path, str(group) + ".bw")
if remove_duplicates:
group_pr.to_bigwig(
Expand All @@ -295,6 +306,7 @@ def export_pseudobulk_one_sample(
value_col="Score",
)
if isinstance(bed_path, str):
log.info("Creating bed file for " + str(group))
bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz")
group_pr.to_bed(
path=bed_path_group, keep=False, compression="infer", chain=False
Expand Down