Skip to content

Commit b74cbec

Browse files
mwiewiorclaude
andcommitted
feat: Add cluster, complement, and subtract range operations
Integrate three new bioframe-compatible interval operations from upstream datafusion-bio-functions PR #17: - cluster: Assign cluster IDs to overlapping/nearby intervals - complement: Compute gaps between intervals (with optional chromsizes view) - subtract: Remove overlapping portions of intervals from another set All operations support the full polars-bio pipeline: DataFrame, LazyFrame, pandas, file paths, and projection pushdown. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ff1242e commit b74cbec

File tree

8 files changed

+543
-12
lines changed

8 files changed

+543
-12
lines changed

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ crate-type= ["cdylib"]
1414
datafusion-python = "50.1.0"
1515
pyo3 = { version = "0.25.1", features = ["extension-module", "abi3"] }
1616
pyo3-log = "0.12.4"
17-
datafusion-bio-function-ranges = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "6893418f6f0cc0bb31146c58f3f60a0b96d29f76" }
17+
datafusion-bio-function-ranges = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "e27228c38982e9cbd3e767eb9d32700ed6fe3054" }
1818

1919
datafusion = { version = "50.3.0"}
2020
arrow = "56.1.0"
@@ -35,7 +35,7 @@ datafusion-bio-format-bed = { git = "https://github.com/biodatageeks/datafusion-
3535
datafusion-bio-format-fasta = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "4ba1ca3e108a5edc5d31d03bacbe04f2ddf0b64d" }
3636
datafusion-bio-format-pairs = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "4ba1ca3e108a5edc5d31d03bacbe04f2ddf0b64d" }
3737

38-
datafusion-bio-function-pileup = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "fddc89a6e12f84d8a51899ceb7ef2f56bfc866e1", default-features = false }
38+
datafusion-bio-function-pileup = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "73309715df876f7e77f0ddc80111440ccb3dd19d", default-features = false }
3939

4040
async-trait = "0.1.86"
4141
futures = "0.3.31"

polars_bio/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@
101101
count_overlaps = range_operations.count_overlaps
102102
coverage = range_operations.coverage
103103
merge = range_operations.merge
104+
cluster = range_operations.cluster
105+
complement = range_operations.complement
106+
subtract = range_operations.subtract
104107

105108
POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
106109

@@ -183,5 +186,8 @@
183186
"count_overlaps",
184187
"coverage",
185188
"merge",
189+
"cluster",
190+
"complement",
191+
"subtract",
186192
"visualize_intervals",
187193
]

polars_bio/range_op.py

Lines changed: 238 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@
2424
pd = None
2525

2626

27-
__all__ = ["overlap", "nearest", "count_overlaps", "merge"]
27+
__all__ = [
28+
"overlap",
29+
"nearest",
30+
"count_overlaps",
31+
"merge",
32+
"cluster",
33+
"complement",
34+
"subtract",
35+
]
2836

2937

3038
from polars_bio.polars_bio import FilterOp, RangeOp, RangeOptions
@@ -609,3 +617,232 @@ def merge(
609617
ctx,
610618
projection_pushdown=projection_pushdown,
611619
)
620+
621+
@staticmethod
622+
def cluster(
623+
df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
624+
min_dist: int = 0,
625+
cols: Union[list[str], None] = ["chrom", "start", "end"],
626+
output_type: str = "polars.LazyFrame",
627+
projection_pushdown: bool = True,
628+
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
629+
"""
630+
Assign cluster IDs to overlapping or nearby genomic intervals.
631+
632+
Groups intervals that overlap or are within ``min_dist`` of each other
633+
into clusters. Each row is annotated with a cluster ID and the
634+
cluster's merged start/end boundaries.
635+
636+
Bioframe inspired API.
637+
638+
The coordinate system (0-based or 1-based) is automatically detected from
639+
DataFrame metadata set at I/O time.
640+
641+
Parameters:
642+
df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
643+
min_dist: Minimum distance (integer) between intervals to cluster. Default is 0.
644+
cols: The names of columns containing the chromosome, start and end of the
645+
genomic intervals.
646+
output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
647+
projection_pushdown: Enable column projection pushdown.
648+
649+
Returns:
650+
**polars.LazyFrame** or polars.DataFrame or pandas.DataFrame with original
651+
interval columns plus ``cluster``, ``cluster_start``, ``cluster_end``.
652+
653+
Raises:
654+
MissingCoordinateSystemError: If input lacks coordinate system metadata
655+
and ``datafusion.bio.coordinate_system_check`` is "true" (default).
656+
"""
657+
suffixes = ("_1", "_2")
658+
_validate_overlap_input(cols, cols, None, suffixes, output_type)
659+
660+
filter_op = _get_filter_op_from_metadata_single(df)
661+
662+
cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols
663+
range_options = RangeOptions(
664+
range_op=RangeOp.Cluster,
665+
filter_op=filter_op,
666+
columns_1=cols,
667+
columns_2=cols,
668+
min_dist=min_dist,
669+
)
670+
671+
return range_operation(
672+
df,
673+
df,
674+
range_options,
675+
output_type,
676+
ctx,
677+
projection_pushdown=projection_pushdown,
678+
)
679+
680+
@staticmethod
681+
def complement(
682+
df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
683+
view_df: Union[pl.DataFrame, pl.LazyFrame, "pd.DataFrame", None] = None,
684+
cols: Union[list[str], None] = ["chrom", "start", "end"],
685+
view_cols: Union[list[str], None] = None,
686+
output_type: str = "polars.LazyFrame",
687+
projection_pushdown: bool = True,
688+
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
689+
"""
690+
Compute the complement of genomic intervals — the gaps between them.
691+
692+
Returns intervals that represent the genomic regions **not** covered
693+
by the input intervals. If ``view_df`` is provided, gaps are computed
694+
within the boundaries of the view (e.g., chromosome sizes); otherwise
695+
each contig spans ``[0, i64::MAX)``.
696+
697+
Bioframe inspired API.
698+
699+
The coordinate system (0-based or 1-based) is automatically detected from
700+
DataFrame metadata set at I/O time.
701+
702+
Parameters:
703+
df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
704+
view_df: Optional DataFrame defining contig boundaries (e.g., chromosome sizes). Each row should have contig, start, end columns.
705+
cols: The names of columns containing the chromosome, start and end of the
706+
genomic intervals.
707+
view_cols: Column names for the view table. Defaults to ``cols`` when not specified.
708+
output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
709+
projection_pushdown: Enable column projection pushdown.
710+
711+
Returns:
712+
**polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of complement
713+
intervals (contig, start, end).
714+
715+
Raises:
716+
MissingCoordinateSystemError: If input lacks coordinate system metadata
717+
and ``datafusion.bio.coordinate_system_check`` is "true" (default).
718+
"""
719+
suffixes = ("_1", "_2")
720+
_validate_overlap_input(cols, cols, None, suffixes, output_type)
721+
722+
filter_op = _get_filter_op_from_metadata_single(df)
723+
724+
cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols
725+
view_cols = cols if view_cols is None else view_cols
726+
727+
# Register view table in DataFusion if provided
728+
view_table_name = None
729+
if view_df is not None:
730+
view_table_name = _register_view_table(view_df, view_cols[0])
731+
732+
range_options = RangeOptions(
733+
range_op=RangeOp.Complement,
734+
filter_op=filter_op,
735+
columns_1=cols,
736+
columns_2=cols,
737+
view_table=view_table_name,
738+
view_columns=view_cols,
739+
)
740+
741+
return range_operation(
742+
df,
743+
df,
744+
range_options,
745+
output_type,
746+
ctx,
747+
projection_pushdown=projection_pushdown,
748+
)
749+
750+
@staticmethod
751+
def subtract(
752+
df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
753+
df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
754+
cols1: Union[list[str], None] = ["chrom", "start", "end"],
755+
cols2: Union[list[str], None] = ["chrom", "start", "end"],
756+
output_type: str = "polars.LazyFrame",
757+
projection_pushdown: bool = True,
758+
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
759+
"""
760+
Subtract the second set of intervals from the first.
761+
762+
For each interval in ``df1``, removes any portion that overlaps with
763+
intervals in ``df2``. The result contains the remaining fragments.
764+
765+
Bioframe inspired API.
766+
767+
The coordinate system (0-based or 1-based) is automatically detected from
768+
DataFrame metadata set at I/O time. Both inputs must have the same coordinate
769+
system.
770+
771+
Parameters:
772+
df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
773+
df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
774+
cols1: The names of columns containing the chromosome, start and end of the
775+
genomic intervals for the first set.
776+
cols2: The names of columns containing the chromosome, start and end of the
777+
genomic intervals for the second set.
778+
output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
779+
projection_pushdown: Enable column projection pushdown.
780+
781+
Returns:
782+
**polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the
783+
remaining interval fragments (contig, start, end).
784+
785+
Raises:
786+
MissingCoordinateSystemError: If either input lacks coordinate system metadata
787+
and ``datafusion.bio.coordinate_system_check`` is "true" (default).
788+
CoordinateSystemMismatchError: If inputs have different coordinate systems.
789+
"""
790+
suffixes = ("_1", "_2")
791+
_validate_overlap_input(cols1, cols2, None, suffixes, output_type)
792+
793+
filter_op = _get_filter_op_from_metadata(df1, df2)
794+
795+
cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
796+
cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
797+
range_options = RangeOptions(
798+
range_op=RangeOp.Subtract,
799+
filter_op=filter_op,
800+
columns_1=cols1,
801+
columns_2=cols2,
802+
)
803+
804+
return range_operation(
805+
df1,
806+
df2,
807+
range_options,
808+
output_type,
809+
ctx,
810+
projection_pushdown=projection_pushdown,
811+
)
812+
813+
814+
def _register_view_table(
815+
view_df: Union[pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
816+
contig_col: str,
817+
) -> str:
818+
"""Register a DataFrame into DataFusion context for use as a view table.
819+
820+
Returns the generated table name.
821+
"""
822+
import pyarrow as pa
823+
824+
from polars_bio.polars_bio import py_from_polars
825+
826+
table_name = f"_view_{id(view_df)}_{hash(contig_col) & 0xFFFFFFFF:08x}"
827+
828+
if isinstance(view_df, pl.LazyFrame):
829+
view_df = view_df.collect()
830+
831+
if isinstance(view_df, pl.DataFrame):
832+
arrow_tbl = view_df.to_arrow()
833+
elif pd is not None and isinstance(view_df, pd.DataFrame):
834+
arrow_tbl = pa.Table.from_pandas(view_df)
835+
# Convert string column to LargeString for DataFusion compatibility
836+
idx = arrow_tbl.schema.get_field_index(contig_col)
837+
if arrow_tbl.schema.field(idx).type == pa.string():
838+
arrow_tbl = arrow_tbl.set_column(
839+
idx,
840+
arrow_tbl.schema.field(idx).name,
841+
pa.compute.cast(arrow_tbl.column(idx), pa.large_string()),
842+
)
843+
else:
844+
raise ValueError("view_df must be a Polars or Pandas DataFrame")
845+
846+
reader = arrow_tbl.to_reader()
847+
py_from_polars(ctx, table_name, reader)
848+
return table_name

polars_bio/range_op_helpers.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,52 @@ def _generate_merge_schema(columns: list[str]) -> pl.Schema:
8989
)
9090

9191

92+
def _generate_cluster_schema(columns: list[str]) -> pl.Schema:
93+
"""Generate schema for cluster operations.
94+
95+
ClusterProvider outputs: (contig: Utf8, start: Int64, end: Int64,
96+
cluster: Int64, cluster_start: Int64, cluster_end: Int64).
97+
"""
98+
return pl.Schema(
99+
{
100+
columns[0]: pl.Utf8,
101+
columns[1]: pl.Int64,
102+
columns[2]: pl.Int64,
103+
"cluster": pl.Int64,
104+
"cluster_start": pl.Int64,
105+
"cluster_end": pl.Int64,
106+
}
107+
)
108+
109+
110+
def _generate_complement_schema(columns: list[str]) -> pl.Schema:
111+
"""Generate schema for complement operations.
112+
113+
ComplementProvider outputs: (contig: Utf8, start: Int64, end: Int64).
114+
"""
115+
return pl.Schema(
116+
{
117+
columns[0]: pl.Utf8,
118+
columns[1]: pl.Int64,
119+
columns[2]: pl.Int64,
120+
}
121+
)
122+
123+
124+
def _generate_subtract_schema(columns: list[str]) -> pl.Schema:
125+
"""Generate schema for subtract operations.
126+
127+
SubtractProvider outputs: (contig: Utf8, start: Int64, end: Int64).
128+
"""
129+
return pl.Schema(
130+
{
131+
columns[0]: pl.Utf8,
132+
columns[1]: pl.Int64,
133+
columns[2]: pl.Int64,
134+
}
135+
)
136+
137+
92138
def _lazyframe_to_dataframe(
93139
df: Union[pl.LazyFrame, "GffLazyFrameWrapper"],
94140
) -> pl.DataFrame:
@@ -160,6 +206,12 @@ def range_operation(
160206
)
161207
elif range_options.range_op == RangeOp.Merge:
162208
merged_schema = _generate_merge_schema(range_options.columns_1)
209+
elif range_options.range_op == RangeOp.Cluster:
210+
merged_schema = _generate_cluster_schema(range_options.columns_1)
211+
elif range_options.range_op == RangeOp.Complement:
212+
merged_schema = _generate_complement_schema(range_options.columns_1)
213+
elif range_options.range_op == RangeOp.Subtract:
214+
merged_schema = _generate_subtract_schema(range_options.columns_1)
163215
else:
164216
# Get the base schemas without suffixes first
165217
df_schema1_base = _get_schema(df1, ctx, None, read_options1)
@@ -244,6 +296,12 @@ def range_operation(
244296
merged_schema = pl.Schema({**df2_base_schema, **{"coverage": pl.Int64}})
245297
elif range_options.range_op == RangeOp.Merge:
246298
merged_schema = _generate_merge_schema(range_options.columns_1)
299+
elif range_options.range_op == RangeOp.Cluster:
300+
merged_schema = _generate_cluster_schema(range_options.columns_1)
301+
elif range_options.range_op == RangeOp.Complement:
302+
merged_schema = _generate_complement_schema(range_options.columns_1)
303+
elif range_options.range_op == RangeOp.Subtract:
304+
merged_schema = _generate_subtract_schema(range_options.columns_1)
247305
else:
248306
merged_schema = _generate_overlap_schema(
249307
df1_base_schema, df2_base_schema, range_options

0 commit comments

Comments
 (0)