|
3 | 3 | Contains pure functions for series/dataframe operations used across the executor. |
4 | 4 | """ |
5 | 5 |
|
6 | | -from typing import Any, Optional, Sequence, Set |
| 6 | +from typing import Any, Optional, Sequence |
7 | 7 |
|
8 | 8 | import pandas as pd |
9 | 9 |
|
10 | 10 | from graphistry.compute.typing import DataFrameT |
11 | 11 |
|
12 | 12 |
|
| 13 | +def _is_cudf_obj(obj: Any) -> bool: |
| 14 | + return hasattr(obj, "__class__") and obj.__class__.__module__.startswith("cudf") |
| 15 | + |
| 16 | + |
| 17 | +def _cudf_index_op(left: Any, right: Any, op: str) -> Any: |
| 18 | + method = getattr(left, op) |
| 19 | + try: |
| 20 | + return method(right, sort=False) |
| 21 | + except TypeError: |
| 22 | + return method(right) |
| 23 | + |
| 24 | + |
13 | 25 | def df_cons(template_df: DataFrameT, data: dict) -> DataFrameT: |
14 | 26 | """Construct a DataFrame of the same type as template_df. |
15 | 27 |
|
@@ -59,26 +71,99 @@ def series_unique(series: Any) -> Any: |
59 | 71 |
|
60 | 72 | For set operations (intersection, union), use series_values() instead. |
61 | 73 | """ |
| 74 | + if _is_cudf_obj(series): |
| 75 | + return series.dropna().unique() |
| 76 | + if isinstance(series, pd.Index): |
| 77 | + return series.dropna().unique() |
62 | 78 | if hasattr(series, 'dropna'): |
63 | 79 | return series.dropna().unique() |
64 | 80 | pandas_series = to_pandas_series(series) |
65 | 81 | return pandas_series.dropna().unique() |
66 | 82 |
|
67 | 83 |
|
def series_values(series: Any) -> Any:
    """Extract unique non-null values from a series as an Index-like domain.

    Returns a pandas.Index for pandas objects, and cudf.Index for cuDF objects.
    These Index types support .intersection/.union/.difference and are safe to
    pass into .isin() without host syncs.

    Args:
        series: A cuDF object, a pandas Index, or anything accepted by
            ``to_pandas_series``.

    Returns:
        An Index holding the distinct non-null values of ``series``.
    """
    if _is_cudf_obj(series):
        # Imported lazily so cuDF is only required when a cuDF object shows up.
        import cudf  # type: ignore

        distinct = series.dropna().unique()
        if isinstance(series, cudf.Index):
            return distinct
        return cudf.Index(distinct)
    if isinstance(series, pd.Index):
        return series.dropna().unique()
    pandas_series = to_pandas_series(series)
    return pd.Index(pandas_series.dropna().unique())
80 | 100 |
|
81 | 101 |
|
def domain_empty(template: Optional[Any] = None) -> Any:
    """Return an empty Index on the same backend as ``template``.

    Args:
        template: Object whose backend decides the result type. ``None`` or
            any non-cuDF object selects pandas.

    Returns:
        ``cudf.Index([])`` when ``template`` is a cuDF object, otherwise
        ``pd.Index([])``.
    """
    if _is_cudf_obj(template):
        # Imported lazily so cuDF is only required on the cuDF path.
        import cudf  # type: ignore

        return cudf.Index([])
    return pd.Index([])
| 107 | + |
| 108 | + |
def domain_is_empty(domain: Any) -> bool:
    """Return True when ``domain`` is ``None`` or has length zero.

    Args:
        domain: ``None`` or any sized object (Index, list, ...).

    Returns:
        True for ``None`` and for zero-length inputs, False otherwise.

    Raises:
        TypeError: If ``domain`` is not ``None`` and does not support ``len``.
    """
    if domain is None:
        return True
    # Use an explicit length test: truthiness of an Index is ambiguous.
    return len(domain) == 0
| 111 | + |
| 112 | + |
def domain_from_values(values: Any, template: Optional[Any] = None) -> Any:
    """Coerce ``values`` into an Index-like domain.

    Backend selection: a cuDF ``values`` always yields a cuDF Index; a pandas
    Index is returned unchanged; anything else follows ``template`` (cuDF if
    the template is a cuDF object, pandas otherwise).

    Args:
        values: ``None``, an existing Index, or any sequence of values.
        template: Optional object used to pick the backend when ``values``
            itself does not determine it.

    Returns:
        A ``cudf.Index`` or ``pd.Index``; an empty one when ``values`` is
        ``None`` or has length zero.
    """
    if domain_is_empty(values):
        # An empty cuDF input stays on cuDF, mirroring the non-empty case
        # below where the type of ``values`` takes precedence over
        # ``template``.
        return domain_empty(values if _is_cudf_obj(values) else template)
    if _is_cudf_obj(values):
        import cudf  # type: ignore

        return values if isinstance(values, cudf.Index) else cudf.Index(values)
    if isinstance(values, pd.Index):
        return values
    if _is_cudf_obj(template):
        import cudf  # type: ignore

        return cudf.Index(values)
    return pd.Index(values)
| 127 | + |
| 128 | + |
def domain_intersect(left: Any, right: Any) -> Any:
    """Return the intersection of two domains.

    Args:
        left: Index-like domain or ``None``.
        right: Index-like domain or ``None``.

    Returns:
        An empty domain (backend taken from ``left``, or from ``right`` when
        ``left`` is ``None``) if either side is empty; otherwise the result of
        ``left.intersection(right)``. For cuDF ``left`` the call goes through
        ``_cudf_index_op``; otherwise the method is called with its default
        arguments.
    """
    if domain_is_empty(left) or domain_is_empty(right):
        return domain_empty(left if left is not None else right)
    # pandas Index is checked first so it never takes the cuDF route.
    if not isinstance(left, pd.Index) and _is_cudf_obj(left):
        return _cudf_index_op(left, right, "intersection")
    return left.intersection(right)
| 137 | + |
| 138 | + |
def domain_union(left: Any, right: Any) -> Any:
    """Return the union of two domains.

    Args:
        left: Index-like domain or ``None``.
        right: Index-like domain or ``None``.

    Returns:
        ``right`` unchanged when ``left`` is empty, ``left`` unchanged when
        ``right`` is empty (so ``None`` may be returned if both are ``None``);
        otherwise the result of ``left.union(right)``. For cuDF ``left`` the
        call goes through ``_cudf_index_op``; otherwise the method is called
        with its default arguments.
    """
    if domain_is_empty(left):
        return right
    if domain_is_empty(right):
        return left
    # pandas Index is checked first so it never takes the cuDF route.
    if not isinstance(left, pd.Index) and _is_cudf_obj(left):
        return _cudf_index_op(left, right, "union")
    return left.union(right)
| 149 | + |
| 150 | + |
def domain_diff(left: Any, right: Any) -> Any:
    """Return the values of ``left`` that are not in ``right``.

    Args:
        left: Index-like domain or ``None``.
        right: Index-like domain or ``None``.

    Returns:
        ``left`` unchanged when either side is empty (so ``None`` is returned
        if ``left`` is ``None``); otherwise the result of
        ``left.difference(right)``. For cuDF ``left`` the call goes through
        ``_cudf_index_op``; otherwise the method is called with its default
        arguments.
    """
    if domain_is_empty(left) or domain_is_empty(right):
        return left
    # pandas Index is checked first so it never takes the cuDF route.
    if not isinstance(left, pd.Index) and _is_cudf_obj(left):
        return _cudf_index_op(left, right, "difference")
    return left.difference(right)
| 159 | + |
| 160 | + |
def domain_to_frame(template_df: DataFrameT, domain: Any, col: str) -> DataFrameT:
    """Build a single-column DataFrame from a domain via ``df_cons``.

    Args:
        template_df: DataFrame passed to ``df_cons`` as the type template.
        domain: Index-like values for the column, or ``None`` for an empty
            column.
        col: Name of the column to create.

    Returns:
        The DataFrame produced by ``df_cons`` with ``col`` holding ``domain``
        (or an empty list when ``domain`` is ``None``).
    """
    column_values = [] if domain is None else domain
    return df_cons(template_df, {col: column_values})
| 165 | + |
| 166 | + |
82 | 167 | # Standard column name for ID DataFrames used in semi-joins |
83 | 168 | _ID_COL = "__id__" |
84 | 169 |
|
|
0 commit comments