-
Notifications
You must be signed in to change notification settings - Fork 218
Cucat Featurization base #486
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 3 commits
cf07249
d73a2db
382e18b
44200ac
777afd4
c1bc6f1
48e4017
6b0b52b
e4b0c0a
7c0c0c6
f344dd8
b6f6388
f185a2f
410c40d
b9067c0
8f0bc3a
b7b8e63
e8eb85a
501ff3b
918ebee
ccf6f47
60de1cf
0b66776
9f086c8
78015f1
616009b
dc38d3b
376890e
b9828c5
8d13cbe
92769bf
af0fc8a
4f78b76
b8a0db2
6e11117
b0d36cd
5677bea
8e15e5e
20200d6
26cd39c
c4c1bd8
d889581
48a7308
ba25c89
8a0ab5c
151ab5b
d63d729
ada126e
21a475d
49976e8
f24411e
d303afb
b34ee85
2456b70
8fc0b22
ee6c523
4808428
a22e85e
86fc662
614fff4
b88e3ea
a72d4b1
4eef71c
0522981
73ba5d1
9da0b11
f9e9260
58d1461
d5acc1a
614d9f3
31b5f5e
bc4f290
74a2460
624c721
2fc6be5
178adba
90bd8b7
5d16a9e
e931456
fc212a8
d4b1fbe
498a4de
aab2ad9
f0eb1bf
867874d
5a69233
cdda3e7
63398b3
b720bc1
ed824ec
1735134
824d940
c7ce92c
30a04a4
50df365
a654f9f
a86be5c
4bd056c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,6 +49,16 @@ | |
| SuperVectorizer = Any | ||
| GapEncoder = Any | ||
| SimilarityEncoder = Any | ||
| try: | ||
| from cu_cat import ( | ||
| SuperVectorizer, | ||
| GapEncoder, | ||
| SimilarityEncoder, | ||
| ) # type: ignore | ||
| except: | ||
| SuperVectorizer = Any | ||
| GapEncoder = Any | ||
| SimilarityEncoder = Any | ||
| try: | ||
| from sklearn.preprocessing import FunctionTransformer | ||
| from sklearn.base import BaseEstimator, TransformerMixin | ||
|
|
@@ -93,6 +103,28 @@ def lazy_import_has_min_dependancy(): | |
| except ModuleNotFoundError as e: | ||
| return False, e | ||
|
|
||
| def lazy_import_has_dependancy_cu_cat(): | ||
| import warnings | ||
| warnings.filterwarnings("ignore") | ||
| try: | ||
| import scipy.sparse # noqa | ||
| from scipy import __version__ as scipy_version | ||
| from cu_cat import __version__ as cu_cat_version | ||
| import cu_cat | ||
| from sklearn import __version__ as sklearn_version | ||
| from cuml import __version__ as cuml_version | ||
| import cuml | ||
| from cudf import __version__ as cudf_version | ||
| import cudf | ||
| logger.debug(f"SCIPY VERSION: {scipy_version}") | ||
| logger.debug(f"Cuda CAT VERSION: {cu_cat_version}") | ||
| logger.debug(f"sklearn VERSION: {sklearn_version}") | ||
| logger.debug(f"cuml VERSION: {cuml_version}") | ||
| logger.debug(f"cudf VERSION: {cudf_version}") | ||
| return True, 'ok', cudf | ||
| except ModuleNotFoundError as e: | ||
| return False, e, None | ||
|
|
||
|
|
||
| def assert_imported_text(): | ||
| has_dependancy_text_, import_text_exn, _ = lazy_import_has_dependancy_text() | ||
|
|
@@ -114,6 +146,33 @@ def assert_imported(): | |
| raise import_min_exn | ||
|
|
||
|
|
||
| def assert_cuml_cucat(): | ||
| has_cuml_dependancy_, import_cuml_exn, cudf = lazy_import_has_dependancy_cu_cat() | ||
| if not has_cuml_dependancy_: | ||
| logger.error( # noqa | ||
| "cuml not found, trying running" # noqa | ||
| "`pip install rapids`" # noqa | ||
| ) | ||
| raise import_cuml_exn | ||
|
|
||
|
|
||
| def make_safe_gpu_dataframes(X, y, engine): | ||
| has_cudf_dependancy_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
| if has_cudf_dependancy_: | ||
| new_kwargs = {} | ||
| kwargs = {'X': X, 'y': y} | ||
| for key, value in kwargs.items(): | ||
| if isinstance(value, cudf.DataFrame) and engine in ["pandas", "dirty_cat", "torch"]: | ||
| new_kwargs[key] = value.to_pandas() | ||
| elif isinstance(value, pd.DataFrame) and engine in ["cuml", "cu_cat"]: | ||
| new_kwargs[key] = cudf.from_pandas(value) | ||
| else: | ||
| new_kwargs[key] = value | ||
| return new_kwargs['X'], new_kwargs['y'] | ||
| else: | ||
| return X, y | ||
|
|
||
|
|
||
| # ############################################################################ | ||
| # | ||
| # Rough calltree | ||
|
|
@@ -137,29 +196,32 @@ def assert_imported(): | |
| # | ||
| # _featurize_or_get_edges_dataframe_if_X_is_None | ||
|
|
||
| FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch"] | ||
| FeatureEngineConcrete = Literal["none", "pandas", "dirty_cat", "torch", "cu_cat"] | ||
| FeatureEngine = Literal[FeatureEngineConcrete, "auto"] | ||
|
|
||
|
|
||
| def resolve_feature_engine( | ||
| feature_engine: FeatureEngine, | ||
| ) -> FeatureEngineConcrete: # noqa | ||
|
|
||
| if feature_engine in ["none", "pandas", "dirty_cat", "torch"]: | ||
| if feature_engine in ["none", "pandas", "dirty_cat", "torch", "cu_cat"]: | ||
| return feature_engine # type: ignore | ||
|
|
||
| if feature_engine == "auto": | ||
| has_dependancy_text_, _, _ = lazy_import_has_dependancy_text() | ||
| if has_dependancy_text_: | ||
| return "torch" | ||
| has_cuml_dependancy_, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
| if has_cuml_dependancy_: | ||
| return "cu_cat" | ||
| has_min_dependancy_, _ = lazy_import_has_min_dependancy() | ||
| if has_min_dependancy_: | ||
| return "dirty_cat" | ||
| return "pandas" | ||
|
|
||
| raise ValueError( # noqa | ||
| f'feature_engine expected to be "none", ' | ||
| '"pandas", "dirty_cat", "torch", or "auto"' | ||
| '"pandas", "dirty_cat", "torch", "cu_cat", or "auto"' | ||
| f'but received: {feature_engine} :: {type(feature_engine)}' | ||
| ) | ||
|
|
||
|
|
@@ -230,18 +292,19 @@ def features_without_target( | |
| :param y: target DataFrame | ||
| :return: DataFrames of model and target | ||
| """ | ||
| _, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
| if y is None: | ||
| return df | ||
| remove_cols = [] | ||
| if y is None: | ||
| pass | ||
| elif isinstance(y, pd.DataFrame): | ||
| elif isinstance(y, pd.DataFrame) or isinstance(y, cudf.DataFrame): | ||
|
||
| yc = y.columns | ||
| xc = df.columns | ||
| for c in yc: | ||
| if c in xc: | ||
| remove_cols.append(c) | ||
| elif isinstance(y, pd.Series): | ||
| elif isinstance(y, pd.Series) or isinstance(y, cudf.Series): | ||
|
||
| if y.name and (y.name in df.columns): | ||
| remove_cols = [y.name] | ||
| elif isinstance(y, List): | ||
|
|
@@ -265,7 +328,7 @@ def remove_node_column_from_symbolic(X_symbolic, node): | |
| logger.info(f"Removing `{node}` from input X_symbolic list") | ||
| X_symbolic.remove(node) | ||
| return X_symbolic | ||
| if isinstance(X_symbolic, pd.DataFrame): | ||
| if isinstance(X_symbolic, pd.DataFrame) or 'cudf' in str(getmodule(X_symbolic)): | ||
| logger.info(f"Removing `{node}` from input X_symbolic DataFrame") | ||
| return X_symbolic.drop(columns=[node], errors="ignore") | ||
|
|
||
|
|
@@ -619,11 +682,19 @@ def fit_pipeline( | |
| columns = X.columns | ||
| index = X.index | ||
|
|
||
| X = transformer.fit_transform(X) | ||
| if keep_n_decimals: | ||
| X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa | ||
|
|
||
| return pd.DataFrame(X, columns=columns, index=index) | ||
| X_type = str(getmodule(X)) | ||
| if 'cudf' not in X_type: | ||
| X = transformer.fit_transform(X) | ||
| if keep_n_decimals: | ||
| X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa | ||
| X = pd.DataFrame(X, columns=columns, index=index) | ||
| else: | ||
| X = transformer.fit_transform(X.to_numpy()) | ||
|
||
| if keep_n_decimals: | ||
| X = np.round(X, decimals=keep_n_decimals) # type: ignore # noqa | ||
| _, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
|
||
| X = cudf.DataFrame(X, columns=columns, index=index) | ||
| return X | ||
|
|
||
|
|
||
| def impute_and_scale_df( | ||
|
|
@@ -848,6 +919,7 @@ def process_dirty_dataframes( | |
| similarity: Optional[str] = None, # "ngram", | ||
| categories: Optional[str] = "auto", | ||
| multilabel: bool = False, | ||
| feature_engine: Optional[str] = "dirty_cat", | ||
lmeyerov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ) -> Tuple[ | ||
| pd.DataFrame, | ||
| Optional[pd.DataFrame], | ||
|
|
@@ -873,8 +945,16 @@ def process_dirty_dataframes( | |
| :return: Encoded data matrix and target (if not None), | ||
| the data encoder, and the label encoder. | ||
| """ | ||
| from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder | ||
| from sklearn.preprocessing import FunctionTransformer | ||
|
|
||
| if feature_engine == 'cu_cat': | ||
| lazy_import_has_dependancy_cu_cat() | ||
| from cu_cat import SuperVectorizer, GapEncoder, SimilarityEncoder | ||
| from cuml.preprocessing import FunctionTransformer | ||
|
|
||
| else: | ||
| from dirty_cat import SuperVectorizer, GapEncoder, SimilarityEncoder | ||
| from sklearn.preprocessing import FunctionTransformer | ||
|
|
||
| t = time() | ||
|
|
||
| if not is_dataframe_all_numeric(ndf): | ||
|
|
@@ -911,12 +991,19 @@ def process_dirty_dataframes( | |
| ) | ||
| # now just set the feature names, since dirty cat changes them in | ||
| # a weird way... | ||
| data_encoder.get_feature_names_out = callThrough(features_transformed) | ||
|
|
||
| X_enc = pd.DataFrame( | ||
| X_enc, columns=features_transformed, index=ndf.index | ||
| ) | ||
| X_enc = X_enc.fillna(0.0) | ||
| data_encoder.get_feature_names_out = callThrough(features_transformed) | ||
| if 'cudf' not in str(getmodule(ndf)): | ||
| X_enc = pd.DataFrame( | ||
| X_enc, columns=features_transformed, index=ndf.index | ||
| ) | ||
| X_enc = X_enc.fillna(0.0) | ||
| else: | ||
| _, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
|
||
| X_enc = cudf.DataFrame( | ||
| X_enc, columns=features_transformed, index=ndf.index | ||
| ) | ||
| X_enc = X_enc.fillna(0.0).to_pandas() # will be removed for future cu_cat release | ||
|
|
||
| else: | ||
| logger.info("-*-*- DataFrame is completely numeric") | ||
| X_enc, _, data_encoder, _ = get_numeric_transformers(ndf, None) | ||
|
|
@@ -1117,7 +1204,8 @@ def process_nodes_dataframes( | |
| n_topics_target=n_topics_target, | ||
| similarity=similarity, | ||
| categories=categories, | ||
| multilabel=multilabel | ||
| multilabel=multilabel, | ||
| feature_engine=feature_engine, | ||
| ) | ||
|
|
||
| if embedding: | ||
|
|
@@ -1235,20 +1323,31 @@ def encode_edges(edf, src, dst, mlb, fit=False): | |
| """ | ||
| # uses mlb with fit=T/F so we can use it in transform mode | ||
| # to recreate edge feature concat definition | ||
| edf_type = str(getmodule(edf)) | ||
| source = edf[src] | ||
| destination = edf[dst] | ||
| source_dtype = str(getmodule(source)) | ||
| logger.debug("Encoding Edges using MultiLabelBinarizer") | ||
| if fit: | ||
| if fit and 'cudf' not in source_dtype: | ||
| T = mlb.fit_transform(zip(source, destination)) | ||
| else: | ||
| elif fit and 'cudf' in source_dtype: | ||
| T = mlb.fit_transform(zip(source.to_pandas(), destination.to_pandas())) | ||
| elif not fit and 'cudf' not in source_dtype: | ||
| T = mlb.transform(zip(source, destination)) | ||
| elif not fit and 'cudf' in source_dtype: | ||
| T = mlb.transform(zip(source.to_pandas(), destination.to_pandas())) | ||
|
|
||
| T = 1.0 * T # coerce to float | ||
| columns = [ | ||
| str(k) for k in mlb.classes_ | ||
| ] # stringify the column names or scikits.base throws error | ||
| mlb.get_feature_names_out = callThrough(columns) | ||
| mlb.columns_ = [src, dst] | ||
| T = pd.DataFrame(T, columns=columns, index=edf.index) | ||
| if 'cudf' in edf_type: | ||
| _, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
| T = cudf.DataFrame(T, columns=columns, index=edf.index) | ||
| else: | ||
| T = pd.DataFrame(T, columns=columns, index=edf.index) | ||
| logger.info(f"Shape of Edge Encoding: {T.shape}") | ||
| return T, mlb | ||
|
|
||
|
|
@@ -1321,6 +1420,7 @@ def process_edge_dataframes( | |
| MultiLabelBinarizer() | ||
| ) # create new one so we can use encode_edges later in | ||
| # transform with fit=False | ||
| _, _, cudf = lazy_import_has_dependancy_cu_cat() | ||
| T, mlb_pairwise_edge_encoder = encode_edges( | ||
| edf, src, dst, mlb_pairwise_edge_encoder, fit=True | ||
| ) | ||
|
|
@@ -1406,7 +1506,11 @@ def process_edge_dataframes( | |
| if not X_enc.empty and not T.empty: | ||
| logger.debug("-" * 60) | ||
| logger.debug("<= Found Edges and Dirty_cat encoding =>") | ||
| X_enc = pd.concat([T, X_enc], axis=1) | ||
| T_type = str(getmodule(T)) | ||
| if 'cudf' in T_type: | ||
| X_enc = cudf.concat([T, X_enc], axis=1) | ||
| else: | ||
| X_enc = pd.concat([T, X_enc], axis=1) | ||
| elif not T.empty and X_enc.empty: | ||
| logger.debug("-" * 60) | ||
| logger.debug("<= Found only Edges =>") | ||
|
|
@@ -1811,7 +1915,7 @@ def prune_weighted_edges_df_and_relabel_nodes( | |
| " -- Pruning weighted edge DataFrame " | ||
| f"from {len(wdf):,} to {len(wdf2):,} edges." | ||
| ) | ||
| if index_to_nodes_dict is not None: | ||
| if index_to_nodes_dict is not None and type(index_to_nodes_dict) == dict: | ||
| wdf2[config.SRC] = wdf2[config.SRC].map(index_to_nodes_dict) | ||
| wdf2[config.DST] = wdf2[config.DST].map(index_to_nodes_dict) | ||
| return wdf2 | ||
|
|
@@ -1952,7 +2056,8 @@ def _featurize_nodes( | |
| X_resolved = resolve_X(ndf, X) | ||
| y_resolved = resolve_y(ndf, y) | ||
|
|
||
| feature_engine = resolve_feature_engine(feature_engine) | ||
| res.feature_engine = feature_engine | ||
dcolinmorgan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine) | ||
|
|
||
| from .features import ModelDict | ||
|
|
||
|
|
@@ -2076,6 +2181,9 @@ def _featurize_edges( | |
| **{res._destination: res._edges[res._destination]} | ||
| ) | ||
|
|
||
| res.feature_engine = feature_engine | ||
dcolinmorgan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| X_resolved, y_resolved = make_safe_gpu_dataframes(X_resolved, y_resolved, engine=feature_engine) | ||
|
|
||
| # now that everything is set | ||
| fkwargs = dict( | ||
| X=X_resolved, | ||
|
|
@@ -2487,13 +2595,18 @@ def featurize( | |
| default True. | ||
| :return: graphistry instance with new attributes set by the featurization process. | ||
| """ | ||
| assert_imported() | ||
| feature_engine = resolve_feature_engine(feature_engine) | ||
|
|
||
| if feature_engine == 'dirty_cat': | ||
| assert_imported() | ||
dcolinmorgan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| elif feature_engine == 'cu_cat': | ||
| assert_cuml_cucat() | ||
|
|
||
| if inplace: | ||
| res = self | ||
| else: | ||
| res = self.bind() | ||
|
|
||
| feature_engine = resolve_feature_engine(feature_engine) | ||
|
|
||
| if kind == "nodes": | ||
| res = res._featurize_nodes( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add `assert cudf is not None`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also probably good to switch from `lazy_import...cu_cat` to a `cudf` one
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok -- after the if statement seems best here again like other assert you mentioned