diff --git a/pycisTopic/pseudobulk_peak_calling.py b/pycisTopic/pseudobulk_peak_calling.py index 7d6fdd1..53fa81d 100644 --- a/pycisTopic/pseudobulk_peak_calling.py +++ b/pycisTopic/pseudobulk_peak_calling.py @@ -23,6 +23,7 @@ def export_pseudobulk( bed_path: str, bigwig_path: str, path_to_fragments: Optional[Dict[str, str]] = None, + chrom_filter: Optional[str] = None, sample_id_col: Optional[str] = "sample_id", n_cpu: Optional[int] = 1, normalize_bigwig: Optional[bool] = True, @@ -55,6 +56,9 @@ def export_pseudobulk( A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to be created. If a :class:`CistopicObject` is provided as input it will be ignored, but if a cell metadata :class:`pd.DataFrame` is provided it is necessary to provide it. The keys of the dictionary need to match with the sample_id tag added to the index names of the input data frame. + chrom_filter: str, optional + A regular expression matched against chromosome names; fragments on matching chromosomes (e.g. unplaced GL/KI scaffolds) are removed from the fragments list. Default: None (no filtering). + Example: `"GL|KI"` sample_id_col: str, optional Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'. 
n_cpu: int, optional @@ -129,6 +133,12 @@ def export_pseudobulk( prepare_tag_cells(cell_data.index.tolist(), split_pattern) ) ] + if chrom_filter is not None: + fragment_drop = fragments_df.Chromosome.str.contains(chrom_filter) + n_fragments_dropped = fragment_drop.sum() + log.info("Filtering out " + str(n_fragments_dropped) + " fragments.") + fragments_df.drop(fragments_df[fragment_drop].index, inplace=True) + fragments_df_dict[sample_id] = fragments_df # Set groups @@ -271,7 +281,7 @@ def export_pseudobulk_one_sample( group_fragments_dict[list(group_fragments_dict.keys())[x]] for x in range(len(fragments_df_dict)) ] - group_fragments = group_fragments_list[0].append(group_fragments_list[1:]) + group_fragments = pd.concat(group_fragments_list) del group_fragments_dict del group_fragments_list @@ -280,6 +290,7 @@ def export_pseudobulk_one_sample( group_pr = pr.PyRanges(group_fragments) if isinstance(bigwig_path, str): + log.info("Creating bigwig file for " + str(group)) bigwig_path_group = os.path.join(bigwig_path, str(group) + ".bw") if remove_duplicates: group_pr.to_bigwig( @@ -295,6 +306,7 @@ def export_pseudobulk_one_sample( value_col="Score", ) if isinstance(bed_path, str): + log.info("Creating bed file for " + str(group)) bed_path_group = os.path.join(bed_path, str(group) + ".bed.gz") group_pr.to_bed( path=bed_path_group, keep=False, compression="infer", chain=False