|
24 | 24 | pd = None |
25 | 25 |
|
26 | 26 |
|
27 | | -__all__ = ["overlap", "nearest", "count_overlaps", "merge"] |
| 27 | +__all__ = [ |
| 28 | + "overlap", |
| 29 | + "nearest", |
| 30 | + "count_overlaps", |
| 31 | + "merge", |
| 32 | + "cluster", |
| 33 | + "complement", |
| 34 | + "subtract", |
| 35 | +] |
28 | 36 |
|
29 | 37 |
|
30 | 38 | from polars_bio.polars_bio import FilterOp, RangeOp, RangeOptions |
@@ -609,3 +617,232 @@ def merge( |
609 | 617 | ctx, |
610 | 618 | projection_pushdown=projection_pushdown, |
611 | 619 | ) |
| 620 | + |
| 621 | + @staticmethod |
| 622 | + def cluster( |
| 623 | + df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"], |
| 624 | + min_dist: int = 0, |
| 625 | + cols: Union[list[str], None] = ["chrom", "start", "end"], |
| 626 | + output_type: str = "polars.LazyFrame", |
| 627 | + projection_pushdown: bool = True, |
| 628 | + ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]: |
| 629 | + """ |
| 630 | + Assign cluster IDs to overlapping or nearby genomic intervals. |
| 631 | +
|
| 632 | + Groups intervals that overlap or are within ``min_dist`` of each other |
| 633 | + into clusters. Each row is annotated with a cluster ID and the |
| 634 | + cluster's merged start/end boundaries. |
| 635 | +
|
| 636 | + Bioframe inspired API. |
| 637 | +
|
| 638 | + The coordinate system (0-based or 1-based) is automatically detected from |
| 639 | + DataFrame metadata set at I/O time. |
| 640 | +
|
| 641 | + Parameters: |
| 642 | + df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported. |
| 643 | + min_dist: Minimum distance (integer) between intervals to cluster. Default is 0. |
| 644 | + cols: The names of columns containing the chromosome, start and end of the |
| 645 | + genomic intervals. |
| 646 | + output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported. |
| 647 | + projection_pushdown: Enable column projection pushdown. |
| 648 | +
|
| 649 | + Returns: |
| 650 | + **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame with original |
| 651 | + interval columns plus ``cluster``, ``cluster_start``, ``cluster_end``. |
| 652 | +
|
| 653 | + Raises: |
| 654 | + MissingCoordinateSystemError: If input lacks coordinate system metadata |
| 655 | + and ``datafusion.bio.coordinate_system_check`` is "true" (default). |
| 656 | + """ |
| 657 | + suffixes = ("_1", "_2") |
| 658 | + _validate_overlap_input(cols, cols, None, suffixes, output_type) |
| 659 | + |
| 660 | + filter_op = _get_filter_op_from_metadata_single(df) |
| 661 | + |
| 662 | + cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols |
| 663 | + range_options = RangeOptions( |
| 664 | + range_op=RangeOp.Cluster, |
| 665 | + filter_op=filter_op, |
| 666 | + columns_1=cols, |
| 667 | + columns_2=cols, |
| 668 | + min_dist=min_dist, |
| 669 | + ) |
| 670 | + |
| 671 | + return range_operation( |
| 672 | + df, |
| 673 | + df, |
| 674 | + range_options, |
| 675 | + output_type, |
| 676 | + ctx, |
| 677 | + projection_pushdown=projection_pushdown, |
| 678 | + ) |
| 679 | + |
| 680 | + @staticmethod |
| 681 | + def complement( |
| 682 | + df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"], |
| 683 | + view_df: Union[pl.DataFrame, pl.LazyFrame, "pd.DataFrame", None] = None, |
| 684 | + cols: Union[list[str], None] = ["chrom", "start", "end"], |
| 685 | + view_cols: Union[list[str], None] = None, |
| 686 | + output_type: str = "polars.LazyFrame", |
| 687 | + projection_pushdown: bool = True, |
| 688 | + ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]: |
| 689 | + """ |
| 690 | + Compute the complement of genomic intervals — the gaps between them. |
| 691 | +
|
| 692 | + Returns intervals that represent the genomic regions **not** covered |
| 693 | + by the input intervals. If ``view_df`` is provided, gaps are computed |
| 694 | + within the boundaries of the view (e.g., chromosome sizes); otherwise |
| 695 | + each contig spans ``[0, i64::MAX)``. |
| 696 | +
|
| 697 | + Bioframe inspired API. |
| 698 | +
|
| 699 | + The coordinate system (0-based or 1-based) is automatically detected from |
| 700 | + DataFrame metadata set at I/O time. |
| 701 | +
|
| 702 | + Parameters: |
| 703 | + df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported. |
| 704 | + view_df: Optional DataFrame defining contig boundaries (e.g., chromosome sizes). Each row should have contig, start, end columns. |
| 705 | + cols: The names of columns containing the chromosome, start and end of the |
| 706 | + genomic intervals. |
| 707 | + view_cols: Column names for the view table. Defaults to ``cols`` when not specified. |
| 708 | + output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported. |
| 709 | + projection_pushdown: Enable column projection pushdown. |
| 710 | +
|
| 711 | + Returns: |
| 712 | + **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of complement |
| 713 | + intervals (contig, start, end). |
| 714 | +
|
| 715 | + Raises: |
| 716 | + MissingCoordinateSystemError: If input lacks coordinate system metadata |
| 717 | + and ``datafusion.bio.coordinate_system_check`` is "true" (default). |
| 718 | + """ |
| 719 | + suffixes = ("_1", "_2") |
| 720 | + _validate_overlap_input(cols, cols, None, suffixes, output_type) |
| 721 | + |
| 722 | + filter_op = _get_filter_op_from_metadata_single(df) |
| 723 | + |
| 724 | + cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols |
| 725 | + view_cols = cols if view_cols is None else view_cols |
| 726 | + |
| 727 | + # Register view table in DataFusion if provided |
| 728 | + view_table_name = None |
| 729 | + if view_df is not None: |
| 730 | + view_table_name = _register_view_table(view_df, view_cols[0]) |
| 731 | + |
| 732 | + range_options = RangeOptions( |
| 733 | + range_op=RangeOp.Complement, |
| 734 | + filter_op=filter_op, |
| 735 | + columns_1=cols, |
| 736 | + columns_2=cols, |
| 737 | + view_table=view_table_name, |
| 738 | + view_columns=view_cols, |
| 739 | + ) |
| 740 | + |
| 741 | + return range_operation( |
| 742 | + df, |
| 743 | + df, |
| 744 | + range_options, |
| 745 | + output_type, |
| 746 | + ctx, |
| 747 | + projection_pushdown=projection_pushdown, |
| 748 | + ) |
| 749 | + |
| 750 | + @staticmethod |
| 751 | + def subtract( |
| 752 | + df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"], |
| 753 | + df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"], |
| 754 | + cols1: Union[list[str], None] = ["chrom", "start", "end"], |
| 755 | + cols2: Union[list[str], None] = ["chrom", "start", "end"], |
| 756 | + output_type: str = "polars.LazyFrame", |
| 757 | + projection_pushdown: bool = True, |
| 758 | + ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]: |
| 759 | + """ |
| 760 | + Subtract the second set of intervals from the first. |
| 761 | +
|
| 762 | + For each interval in ``df1``, removes any portion that overlaps with |
| 763 | + intervals in ``df2``. The result contains the remaining fragments. |
| 764 | +
|
| 765 | + Bioframe inspired API. |
| 766 | +
|
| 767 | + The coordinate system (0-based or 1-based) is automatically detected from |
| 768 | + DataFrame metadata set at I/O time. Both inputs must have the same coordinate |
| 769 | + system. |
| 770 | +
|
| 771 | + Parameters: |
| 772 | + df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported. |
| 773 | + df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported. |
| 774 | + cols1: The names of columns containing the chromosome, start and end of the |
| 775 | + genomic intervals for the first set. |
| 776 | + cols2: The names of columns containing the chromosome, start and end of the |
| 777 | + genomic intervals for the second set. |
| 778 | + output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported. |
| 779 | + projection_pushdown: Enable column projection pushdown. |
| 780 | +
|
| 781 | + Returns: |
| 782 | + **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the |
| 783 | + remaining interval fragments (contig, start, end). |
| 784 | +
|
| 785 | + Raises: |
| 786 | + MissingCoordinateSystemError: If either input lacks coordinate system metadata |
| 787 | + and ``datafusion.bio.coordinate_system_check`` is "true" (default). |
| 788 | + CoordinateSystemMismatchError: If inputs have different coordinate systems. |
| 789 | + """ |
| 790 | + suffixes = ("_1", "_2") |
| 791 | + _validate_overlap_input(cols1, cols2, None, suffixes, output_type) |
| 792 | + |
| 793 | + filter_op = _get_filter_op_from_metadata(df1, df2) |
| 794 | + |
| 795 | + cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1 |
| 796 | + cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2 |
| 797 | + range_options = RangeOptions( |
| 798 | + range_op=RangeOp.Subtract, |
| 799 | + filter_op=filter_op, |
| 800 | + columns_1=cols1, |
| 801 | + columns_2=cols2, |
| 802 | + ) |
| 803 | + |
| 804 | + return range_operation( |
| 805 | + df1, |
| 806 | + df2, |
| 807 | + range_options, |
| 808 | + output_type, |
| 809 | + ctx, |
| 810 | + projection_pushdown=projection_pushdown, |
| 811 | + ) |
| 812 | + |
| 813 | + |
| 814 | +def _register_view_table( |
| 815 | + view_df: Union[pl.DataFrame, pl.LazyFrame, "pd.DataFrame"], |
| 816 | + contig_col: str, |
| 817 | +) -> str: |
| 818 | + """Register a DataFrame into DataFusion context for use as a view table. |
| 819 | +
|
| 820 | + Returns the generated table name. |
| 821 | + """ |
| 822 | + import pyarrow as pa |
| 823 | + |
| 824 | + from polars_bio.polars_bio import py_from_polars |
| 825 | + |
| 826 | + table_name = f"_view_{id(view_df)}_{hash(contig_col) & 0xFFFFFFFF:08x}" |
| 827 | + |
| 828 | + if isinstance(view_df, pl.LazyFrame): |
| 829 | + view_df = view_df.collect() |
| 830 | + |
| 831 | + if isinstance(view_df, pl.DataFrame): |
| 832 | + arrow_tbl = view_df.to_arrow() |
| 833 | + elif pd is not None and isinstance(view_df, pd.DataFrame): |
| 834 | + arrow_tbl = pa.Table.from_pandas(view_df) |
| 835 | + # Convert string column to LargeString for DataFusion compatibility |
| 836 | + idx = arrow_tbl.schema.get_field_index(contig_col) |
| 837 | + if arrow_tbl.schema.field(idx).type == pa.string(): |
| 838 | + arrow_tbl = arrow_tbl.set_column( |
| 839 | + idx, |
| 840 | + arrow_tbl.schema.field(idx).name, |
| 841 | + pa.compute.cast(arrow_tbl.column(idx), pa.large_string()), |
| 842 | + ) |
| 843 | + else: |
| 844 | + raise ValueError("view_df must be a Polars or Pandas DataFrame") |
| 845 | + |
| 846 | + reader = arrow_tbl.to_reader() |
| 847 | + py_from_polars(ctx, table_name, reader) |
| 848 | + return table_name |
0 commit comments