biodatageeks
diff --git a/‎Cargo.lock‎
Lines changed: 4 additions & 4 deletions b/‎Cargo.lock‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 2 additions & 2 deletions b/‎Cargo.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎polars_bio/__init__.py‎
Lines changed: 6 additions & 0 deletions b/‎polars_bio/__init__.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎polars_bio/range_op.py‎
Lines changed: 238 additions & 1 deletion b/‎polars_bio/range_op.py‎
Lines changed: 238 additions & 1 deletion
diff --git a/‎polars_bio/range_op_helpers.py‎
Lines changed: 58 additions & 0 deletions b/‎polars_bio/range_op_helpers.py‎
Lines changed: 58 additions & 0 deletions
@@ -14,7 +14,7 @@ crate-type= ["cdylib"]
 datafusion-python = "50.1.0"
 pyo3 = { version = "0.25.1", features = ["extension-module", "abi3"] }
 pyo3-log = "0.12.4"
-datafusion-bio-function-ranges = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "6893418f6f0cc0bb31146c58f3f60a0b96d29f76" }
+datafusion-bio-function-ranges = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "e27228c38982e9cbd3e767eb9d32700ed6fe3054" }
 
 datafusion = { version = "50.3.0"}
 arrow = "56.1.0"
@@ -35,7 +35,7 @@ datafusion-bio-format-bed = { git = "https://github.com/biodatageeks/datafusion-
 datafusion-bio-format-fasta = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "4ba1ca3e108a5edc5d31d03bacbe04f2ddf0b64d" }
 datafusion-bio-format-pairs = { git = "https://github.com/biodatageeks/datafusion-bio-formats.git", rev = "4ba1ca3e108a5edc5d31d03bacbe04f2ddf0b64d" }
 
-datafusion-bio-function-pileup = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "fddc89a6e12f84d8a51899ceb7ef2f56bfc866e1", default-features = false }
+datafusion-bio-function-pileup = { git = "https://github.com/biodatageeks/datafusion-bio-functions.git", rev = "73309715df876f7e77f0ddc80111440ccb3dd19d", default-features = false }
 
 async-trait = "0.1.86"
 futures = "0.3.31"
 
@@ -101,6 +101,9 @@
 count_overlaps = range_operations.count_overlaps
 coverage = range_operations.coverage
 merge = range_operations.merge
+cluster = range_operations.cluster
+complement = range_operations.complement
+subtract = range_operations.subtract
 
 POLARS_BIO_MAX_THREADS = "datafusion.execution.target_partitions"
 
@@ -183,5 +186,8 @@
     "count_overlaps",
     "coverage",
     "merge",
+    "cluster",
+    "complement",
+    "subtract",
     "visualize_intervals",
 ]
@@ -24,7 +24,15 @@
     pd = None
 
 
-__all__ = ["overlap", "nearest", "count_overlaps", "merge"]
+__all__ = [
+    "overlap",
+    "nearest",
+    "count_overlaps",
+    "merge",
+    "cluster",
+    "complement",
+    "subtract",
+]
 
 
 from polars_bio.polars_bio import FilterOp, RangeOp, RangeOptions
@@ -609,3 +617,232 @@ def merge(
             ctx,
             projection_pushdown=projection_pushdown,
         )
+
+    @staticmethod
+    def cluster(
+        df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
+        min_dist: int = 0,
+        cols: Union[list[str], None] = ["chrom", "start", "end"],
+        output_type: str = "polars.LazyFrame",
+        projection_pushdown: bool = True,
+    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
+        """
+        Annotate every genomic interval with the cluster it belongs to.
+
+        Intervals that overlap, or that lie within ``min_dist`` of one another,
+        are placed in the same cluster. Each input row is returned together
+        with its cluster ID and the merged start/end boundaries of that cluster.
+
+        Bioframe inspired API.
+
+        Whether coordinates are 0-based or 1-based is detected automatically
+        from the DataFrame metadata set at I/O time.
+
+        Parameters:
+            df: A path to a file, a polars DataFrame/LazyFrame, or a pandas DataFrame. CSV with a header, BED and Parquet files are supported.
+            min_dist: Distance (integer) within which intervals are still placed in the same cluster. Defaults to 0.
+            cols: Names of the columns holding the chromosome, start and end of
+                the genomic intervals.
+            output_type: Type of the result. "polars.LazyFrame" (default), "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are supported.
+            projection_pushdown: Enable column projection pushdown.
+
+        Returns:
+            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame holding the
+            original interval columns plus ``cluster``, ``cluster_start`` and
+            ``cluster_end``.
+
+        Raises:
+            MissingCoordinateSystemError: If the input lacks coordinate system metadata
+                and ``datafusion.bio.coordinate_system_check`` is "true" (default).
+        """
+        # The validator takes two column lists; this single-input operation
+        # passes ``cols`` for both. The suffix pair is used only by this call.
+        _validate_overlap_input(cols, cols, None, ("_1", "_2"), output_type)
+
+        filter_op = _get_filter_op_from_metadata_single(df)
+
+        interval_cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols
+        options = RangeOptions(
+            range_op=RangeOp.Cluster,
+            filter_op=filter_op,
+            columns_1=interval_cols,
+            columns_2=interval_cols,
+            min_dist=min_dist,
+        )
+
+        # The same input is handed over as both sides of the range operation.
+        return range_operation(
+            df,
+            df,
+            options,
+            output_type,
+            ctx,
+            projection_pushdown=projection_pushdown,
+        )
+
+    @staticmethod
+    def complement(
+        df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
+        view_df: Union[pl.DataFrame, pl.LazyFrame, "pd.DataFrame", None] = None,
+        cols: Union[list[str], None] = ["chrom", "start", "end"],
+        view_cols: Union[list[str], None] = None,
+        output_type: str = "polars.LazyFrame",
+        projection_pushdown: bool = True,
+    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
+        """
+        Return the gaps between genomic intervals (their complement).
+
+        The result describes the genomic regions that are **not** covered by
+        the input intervals. When ``view_df`` is given, gaps are computed inside
+        the boundaries of that view (e.g., chromosome sizes); without it each
+        contig spans ``[0, i64::MAX)``.
+
+        Bioframe inspired API.
+
+        Whether coordinates are 0-based or 1-based is detected automatically
+        from the DataFrame metadata set at I/O time.
+
+        Parameters:
+            df: A path to a file, a polars DataFrame/LazyFrame, or a pandas DataFrame. CSV with a header, BED and Parquet files are supported.
+            view_df: Optional polars DataFrame/LazyFrame or pandas DataFrame defining contig boundaries (e.g., chromosome sizes). Each row should have contig, start, end columns.
+            cols: Names of the columns holding the chromosome, start and end of
+                the genomic intervals.
+            view_cols: Column names of the view table. Falls back to ``cols`` when not given.
+            output_type: Type of the result. "polars.LazyFrame" (default), "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are supported.
+            projection_pushdown: Enable column projection pushdown.
+
+        Returns:
+            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of complement
+            intervals (contig, start, end).
+
+        Raises:
+            MissingCoordinateSystemError: If the input lacks coordinate system metadata
+                and ``datafusion.bio.coordinate_system_check`` is "true" (default).
+        """
+        # The validator takes two column lists; this single-input operation
+        # passes ``cols`` for both. ``view_cols`` is not passed to the validator.
+        _validate_overlap_input(cols, cols, None, ("_1", "_2"), output_type)
+
+        filter_op = _get_filter_op_from_metadata_single(df)
+
+        if cols is None:
+            cols = DEFAULT_INTERVAL_COLUMNS
+        if view_cols is None:
+            view_cols = cols
+
+        # A view table is registered in the DataFusion context only when a view
+        # was supplied; its first column name is passed on as the contig column.
+        # NOTE(review): nothing in this method removes the registered table again.
+        view_table_name = (
+            None if view_df is None else _register_view_table(view_df, view_cols[0])
+        )
+
+        options = RangeOptions(
+            range_op=RangeOp.Complement,
+            filter_op=filter_op,
+            columns_1=cols,
+            columns_2=cols,
+            view_table=view_table_name,
+            view_columns=view_cols,
+        )
+
+        # The same input is handed over as both sides of the range operation.
+        return range_operation(
+            df,
+            df,
+            options,
+            output_type,
+            ctx,
+            projection_pushdown=projection_pushdown,
+        )
+
+    @staticmethod
+    def subtract(
+        df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
+        df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
+        cols1: Union[list[str], None] = ["chrom", "start", "end"],
+        cols2: Union[list[str], None] = ["chrom", "start", "end"],
+        output_type: str = "polars.LazyFrame",
+        projection_pushdown: bool = True,
+    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
+        """
+        Remove the intervals of ``df2`` from the intervals of ``df1``.
+
+        Every portion of a ``df1`` interval that overlaps an interval in
+        ``df2`` is cut away; the result holds the fragments that remain.
+
+        Bioframe inspired API.
+
+        Whether coordinates are 0-based or 1-based is detected automatically
+        from the DataFrame metadata set at I/O time. Both inputs must use the
+        same coordinate system.
+
+        Parameters:
+            df1: A path to a file, a polars DataFrame/LazyFrame, or a pandas DataFrame. CSV with a header, BED and Parquet files are supported.
+            df2: A path to a file, a polars DataFrame/LazyFrame, or a pandas DataFrame. CSV with a header, BED and Parquet files are supported.
+            cols1: Names of the columns holding the chromosome, start and end of
+                the genomic intervals in the first set.
+            cols2: Names of the columns holding the chromosome, start and end of
+                the genomic intervals in the second set.
+            output_type: Type of the result. "polars.LazyFrame" (default), "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are supported.
+            projection_pushdown: Enable column projection pushdown.
+
+        Returns:
+            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the
+            remaining interval fragments (contig, start, end).
+
+        Raises:
+            MissingCoordinateSystemError: If either input lacks coordinate system metadata
+                and ``datafusion.bio.coordinate_system_check`` is "true" (default).
+            CoordinateSystemMismatchError: If inputs have different coordinate systems.
+        """
+        # The suffix pair is used only by this validator call.
+        _validate_overlap_input(cols1, cols2, None, ("_1", "_2"), output_type)
+
+        filter_op = _get_filter_op_from_metadata(df1, df2)
+
+        left_cols = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
+        right_cols = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
+        options = RangeOptions(
+            range_op=RangeOp.Subtract,
+            filter_op=filter_op,
+            columns_1=left_cols,
+            columns_2=right_cols,
+        )
+
+        return range_operation(
+            df1,
+            df2,
+            options,
+            output_type,
+            ctx,
+            projection_pushdown=projection_pushdown,
+        )
+
+
+def _register_view_table(
+    view_df: Union[pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
+    contig_col: str,
+) -> str:
+    """Register a DataFrame into DataFusion context for use as a view table.
+
+    A polars LazyFrame is collected first. The frame is converted to an Arrow
+    table and registered in the module-level ``ctx`` under a freshly generated,
+    unique name.
+
+    Parameters:
+        view_df: The view as a polars DataFrame/LazyFrame or a pandas DataFrame.
+        contig_col: Name of the contig column. For pandas input this column is
+            cast from Arrow ``string`` to ``large_string`` when needed.
+
+    Returns:
+        The generated table name.
+
+    Raises:
+        ValueError: If ``view_df`` is neither a polars nor a pandas DataFrame.
+    """
+    import uuid
+
+    import pyarrow as pa
+
+    from polars_bio.polars_bio import py_from_polars
+
+    # A random name rather than one derived from id(view_df): object ids are
+    # reused once an object is garbage collected, so an id-based name could
+    # collide with a table that an earlier, still-lazy result refers to.
+    table_name = f"_view_{uuid.uuid4().hex}"
+
+    if isinstance(view_df, pl.LazyFrame):
+        view_df = view_df.collect()
+
+    if isinstance(view_df, pl.DataFrame):
+        arrow_tbl = view_df.to_arrow()
+    elif pd is not None and isinstance(view_df, pd.DataFrame):
+        arrow_tbl = pa.Table.from_pandas(view_df)
+        # Convert string column to LargeString for DataFusion compatibility
+        # get_field_index returns -1 when the name is missing or duplicated;
+        # the guard keeps a negative index from selecting an unrelated column.
+        idx = arrow_tbl.schema.get_field_index(contig_col)
+        if idx >= 0 and arrow_tbl.schema.field(idx).type == pa.string():
+            arrow_tbl = arrow_tbl.set_column(
+                idx,
+                arrow_tbl.schema.field(idx).name,
+                # ChunkedArray.cast does not depend on ``pyarrow.compute``
+                # having been imported elsewhere, unlike ``pa.compute.cast``.
+                arrow_tbl.column(idx).cast(pa.large_string()),
+            )
+    else:
+        raise ValueError("view_df must be a Polars or Pandas DataFrame")
+
+    reader = arrow_tbl.to_reader()
+    py_from_polars(ctx, table_name, reader)
+    return table_name
@@ -89,6 +89,52 @@ def _generate_merge_schema(columns: list[str]) -> pl.Schema:
     )
 
 
+def _generate_cluster_schema(columns: list[str]) -> pl.Schema:
+    """Build the Polars schema of a cluster result.
+
+    Mirrors the ClusterProvider output: (contig: Utf8, start: Int64, end: Int64,
+    cluster: Int64, cluster_start: Int64, cluster_end: Int64). The first three
+    fields are named after ``columns[0]``, ``columns[1]`` and ``columns[2]``.
+    """
+    contig_name, start_name, end_name = columns[0], columns[1], columns[2]
+    fields = {
+        contig_name: pl.Utf8,
+        start_name: pl.Int64,
+        end_name: pl.Int64,
+    }
+    # Cluster annotation columns always carry these fixed names.
+    for annotation in ("cluster", "cluster_start", "cluster_end"):
+        fields[annotation] = pl.Int64
+    return pl.Schema(fields)
+
+
+def _generate_complement_schema(columns: list[str]) -> pl.Schema:
+    """Build the Polars schema of a complement result.
+
+    Mirrors the ComplementProvider output: (contig: Utf8, start: Int64,
+    end: Int64), with the fields named after the first three ``columns``.
+    """
+    contig_name, start_name, end_name = columns[0], columns[1], columns[2]
+    return pl.Schema(
+        {contig_name: pl.Utf8, start_name: pl.Int64, end_name: pl.Int64}
+    )
+
+
+def _generate_subtract_schema(columns: list[str]) -> pl.Schema:
+    """Build the Polars schema of a subtract result.
+
+    Mirrors the SubtractProvider output: (contig: Utf8, start: Int64,
+    end: Int64), with the fields named after the first three ``columns``.
+    """
+    contig_name, start_name, end_name = columns[0], columns[1], columns[2]
+    layout = {}
+    layout[contig_name] = pl.Utf8
+    layout[start_name] = pl.Int64
+    layout[end_name] = pl.Int64
+    return pl.Schema(layout)
+
+
 def _lazyframe_to_dataframe(
     df: Union[pl.LazyFrame, "GffLazyFrameWrapper"],
 ) -> pl.DataFrame:
@@ -160,6 +206,12 @@ def range_operation(
             )
         elif range_options.range_op == RangeOp.Merge:
             merged_schema = _generate_merge_schema(range_options.columns_1)
+        elif range_options.range_op == RangeOp.Cluster:
+            merged_schema = _generate_cluster_schema(range_options.columns_1)
+        elif range_options.range_op == RangeOp.Complement:
+            merged_schema = _generate_complement_schema(range_options.columns_1)
+        elif range_options.range_op == RangeOp.Subtract:
+            merged_schema = _generate_subtract_schema(range_options.columns_1)
         else:
             # Get the base schemas without suffixes first
             df_schema1_base = _get_schema(df1, ctx, None, read_options1)
@@ -244,6 +296,12 @@ def range_operation(
                 merged_schema = pl.Schema({**df2_base_schema, **{"coverage": pl.Int64}})
             elif range_options.range_op == RangeOp.Merge:
                 merged_schema = _generate_merge_schema(range_options.columns_1)
+            elif range_options.range_op == RangeOp.Cluster:
+                merged_schema = _generate_cluster_schema(range_options.columns_1)
+            elif range_options.range_op == RangeOp.Complement:
+                merged_schema = _generate_complement_schema(range_options.columns_1)
+            elif range_options.range_op == RangeOp.Subtract:
+                merged_schema = _generate_subtract_schema(range_options.columns_1)
             else:
                 merged_schema = _generate_overlap_schema(
                     df1_base_schema, df2_base_schema, range_options