electronic_warefare_hackathon/cluster_eval.py at master · adaykoth/electronic_warefare_hackathon · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import sys
from pathlib import Path
import numpy as np
import polars as pl
import pandas as pd
import pyarrow as pa
import pyarrow.ipc as ipc

from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture

from parse_data import load_window


def cluster_unlabeled_data(
    df: pl.DataFrame,
    varlist: list[str],
    n_components: int = 20,
    covariance_type: str = "full",
    reg_covar: float = 1e-6,
    n_init: int = 1,
    max_iter: int = 100,
    random_state: int = 0,
):
    """
    Fit a Gaussian Mixture Model on selected columns and label every row.

    The Polars frame is converted to Pandas, the columns named in ``varlist``
    are standardized with ``StandardScaler``, a ``GaussianMixture`` is fitted
    on the standardized values, and each row receives the index of the
    component predicted for it in a new "label" column.

    Parameters:
        df (pl.DataFrame): Unlabeled input data.
        varlist (list[str]): Names of the columns used as clustering features.
        n_components (int): Number of mixture components (clusters).
        covariance_type (str): Covariance structure of the components; one of
            'full', 'diag', 'tied' or 'spherical'.
        reg_covar (float): Regularization term added to the covariance matrices.
        n_init (int): Number of initializations.
        max_iter (int): Maximum number of iterations.
        random_state (int): Seed passed to the mixture model for reproducibility.

    Returns:
        pd.DataFrame: The Pandas conversion of ``df`` with an additional
        "label" column holding the cluster assignment of every row.
    """
    # All further work happens on the Pandas copy of the data.
    frame = df.to_pandas()

    # Scale the selected feature columns before fitting the mixture.
    scaled_features = StandardScaler().fit_transform(frame[varlist])

    mixture = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        reg_covar=reg_covar,
        n_init=n_init,
        max_iter=max_iter,
        random_state=random_state,
    )

    # fit() returns the fitted estimator, so the labels for the training data
    # can be predicted directly from it.
    frame["label"] = mixture.fit(scaled_features).predict(scaled_features)
    print("GMM clustering completed. 'label' column added.")

    return frame


if __name__ == "__main__":
    # Expect two command-line arguments:
    #   1. The input data file path.
    #   2. The output file path where the new DataFrame should be saved as a .ipc file.
    # Exit with a usage message (on stderr, non-zero status) instead of an
    # IndexError traceback when an argument is missing. Additional arguments
    # are ignored.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <input_data_file> <output_ipc_file>")

    data_file = Path(sys.argv[1])
    output_file = Path(sys.argv[2])

    # Make sure the destination directory exists before the expensive fit, so
    # a missing directory cannot discard the clustering result at write time.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Load the data using the project's load_window function.
    df = load_window(data_file)

    # Columns used as clustering features.
    feature_list = [
        "amplitude",
        "frequency",
        "pulse_width",
    ]

    # Run the clustering algorithm on the entire dataset.
    clustered_df = cluster_unlabeled_data(
        df,
        feature_list,
        n_components=150,
        covariance_type="full",
        reg_covar=1e-6,
        n_init=2,
        max_iter=50,
        random_state=0,
    )

    # Convert the labeled Pandas DataFrame to an Arrow Table for serialization.
    table = pa.Table.from_pandas(clustered_df)

    # Save the Arrow Table to an IPC file using RecordBatchFileWriter; both the
    # file handle and the writer are closed when the with-block exits.
    with pa.OSFile(str(output_file), "wb") as sink, ipc.RecordBatchFileWriter(
        sink, table.schema
    ) as writer:
        writer.write_table(table)
    print(f"Clustered data saved as IPC file to {output_file}")