38 changes: 30 additions & 8 deletions aaanalysis/feature_engineering/_backend/cpp/utils_feature.py
@@ -186,17 +186,39 @@ def get_list_parts(features=None):
return list_parts


def get_df_parts_(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None):
"""Create DataFrame with sequence parts"""
def get_df_parts_(df_seq=None, list_parts=None, jmd_n_len=None, jmd_c_len=None, handle_duplicates=None):
"""Create DataFrame with sequence parts, ensuring unique index using serial number if duplicates are present."""

# Check whether the position-based sequence columns are present in the DataFrame
pos_based = set(ut.COLS_SEQ_POS).issubset(set(df_seq))

# Copy the original DataFrame to avoid altering it
_df_seq = df_seq.copy()

# Extract parts for each row in the DataFrame
_df_seq['parts'] = df_seq.apply(lambda row:
_extract_parts(row, jmd_n_len, jmd_c_len, pos_based, list_parts), axis=1)
# Convert the extracted parts into a DataFrame
df_parts = pd.DataFrame(_df_seq['parts'].to_list(),
index=df_seq[ut.COL_ENTRY])
# DEV: the following line sorts index if list_parts contains just one element
# df_parts = pd.DataFrame.from_dict(dict_parts, orient="index")

# Check for duplicates in the specified column (COL_ENTRY)
duplicate_entries = _df_seq[ut.COL_ENTRY].duplicated()

if duplicate_entries.any():
# If the user wants to handle duplicates, proceed as specified
if handle_duplicates is not None:
if handle_duplicates:
print(f"Handling duplicates as per user input: {handle_duplicates}")
# Create a unique identifier by adding a serial number to duplicates
_df_seq[ut.COL_ENTRY] = _df_seq.groupby(ut.COL_ENTRY).cumcount().astype(str) + "_" + _df_seq[ut.COL_ENTRY]
else:
print("Duplicates found but not handling them as 'handle_duplicates' is False.")
else:
# If no user input is provided, issue a warning and handle duplicates by adding a serial number
print(f"Warning: Duplicates found in '{ut.COL_ENTRY}'. Adding serial number to make them unique.")
_df_seq[ut.COL_ENTRY] = _df_seq.groupby(ut.COL_ENTRY).cumcount().astype(str) + "_" + _df_seq[ut.COL_ENTRY]

# Convert the 'parts' into a DataFrame with the unique entries as the index
df_parts = pd.DataFrame(_df_seq['parts'].to_list(), index=_df_seq[ut.COL_ENTRY])

return df_parts


@@ -314,4 +336,4 @@ def add_scale_info_(df_feat=None, df_cat=None):
dict_cat = dict(zip(df_cat[ut.COL_SCALE_ID], df_cat[col]))
vals = [dict_cat[s.split("-")[2]] for s in df_feat[ut.COL_FEATURE]]
df_feat.insert(i + 1, col, vals)
return df_feat
return df_feat
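
Reviewer note: for reference, a minimal standalone sketch (not part of this PR) of the `groupby(...).cumcount()` de-duplication pattern used in `get_df_parts_` above. The column name `entry` stands in for `ut.COL_ENTRY`, and the data is illustrative only.

```python
# Minimal sketch of the cumcount-based de-duplication applied in get_df_parts_.
# 'entry' is a placeholder for ut.COL_ENTRY; the sequences are made up.
import pandas as pd

df_seq = pd.DataFrame({
    "entry": ["P1", "P2", "P1"],
    "sequence": ["MKTAYIAK", "MASNDYTQ", "MKTAYIAK"],
})

# Prefix every entry with its per-group serial number, so the second "P1"
# becomes "1_P1" while first occurrences get a "0_" prefix.
df_seq["entry"] = df_seq.groupby("entry").cumcount().astype(str) + "_" + df_seq["entry"]
print(df_seq["entry"].tolist())  # ['0_P1', '0_P2', '1_P1']
```

Note that this prefixes all entries (not only the duplicated ones) whenever any duplicate is present, which matches the behavior of the changed code.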
2 changes: 1 addition & 1 deletion tutorials/tutorial1_quick_start.ipynb
@@ -188,7 +188,7 @@
}
],
"source": [
"# CPP creates around 100.000 features and filters them down to 100\n",
"# CPP creates around 100,000 features and filters them down to 100\n",
"df_parts = sf.get_df_parts(df_seq=df_seq)\n",
"cpp = aa.CPP(df_scales=df_scales, df_parts=df_parts)\n",
"df_feat = cpp.run(labels=labels)\n",