Skip to content

Commit 443744a

Browse files
authored
Fix filter_transcripts to decode bytes feature names to UTF-8 before filtering.
1 parent 5a62798 commit 443744a

File tree

1 file changed

+12
-3
lines changed

1 file changed

+12
-3
lines changed

src/segger/data/utils.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def try_import(module_name):
4343
from datetime import timedelta
4444

4545

46-
def filter_transcripts(
46+
def filter_transcripts( #ONLY FOR XENIUM
4747
transcripts_df: pd.DataFrame,
4848
min_qv: float = 20.0,
4949
) -> pd.DataFrame:
@@ -65,8 +65,17 @@ def filter_transcripts(
6565
"DeprecatedCodeword_",
6666
"UnassignedCodeword_",
6767
)
68-
mask = transcripts_df["qv"].ge(min_qv)
69-
mask &= ~transcripts_df["feature_name"].str.startswith(filter_codewords)
68+
69+
transcripts_df['feature_name'] = transcripts_df['feature_name'].apply(
70+
lambda x: x.decode("utf-8") if isinstance(x, bytes) else x
71+
)
72+
mask_quality = transcripts_df['qv'] >= min_qv
73+
74+
# Apply the filter for unwanted codewords using Dask string functions
75+
mask_codewords = ~transcripts_df['feature_name'].str.startswith(filter_codewords)
76+
77+
# Combine the filters and return the filtered Dask DataFrame
78+
mask = mask_quality & mask_codewords
7079
return transcripts_df[mask]
7180

7281

0 commit comments

Comments
 (0)