From 46e3640e43bb03cd72ddd5c6a2fd78fdbe652ff9 Mon Sep 17 00:00:00 2001
From: Krishna Simha
Date: Sat, 19 Jul 2025 15:52:50 +0530
Subject: [PATCH 1/2] Added Streamlit preprocessing web app

---
 data_preprocessing_app/open-source/README.md |  35 +++++
 .../open-source/__init__.py                  |   0
 data_preprocessing_app/open-source/app.py    | 118 ++++++++++++++++
 .../open-source/data_tool.py                 | 129 ++++++++++++++++++
 .../open-source/requirements.txt             |   6 +
 5 files changed, 288 insertions(+)
 create mode 100644 data_preprocessing_app/open-source/README.md
 create mode 100644 data_preprocessing_app/open-source/__init__.py
 create mode 100644 data_preprocessing_app/open-source/app.py
 create mode 100644 data_preprocessing_app/open-source/data_tool.py
 create mode 100644 data_preprocessing_app/open-source/requirements.txt

diff --git a/data_preprocessing_app/open-source/README.md b/data_preprocessing_app/open-source/README.md
new file mode 100644
index 000000000..8db0384d4
--- /dev/null
+++ b/data_preprocessing_app/open-source/README.md
@@ -0,0 +1,35 @@
+# Data Preprocessing Web App
+
+A simple and interactive Streamlit-based web app to **preview**, **clean**, **visualize**, and **transform** your datasets — no coding required!
+
+---
+
+## 🔧 Features
+
+- Upload datasets in CSV, Excel, JSON, SQL, or Parquet format
+- Preview dataset (head & tail)
+- Clean data:
+  - Drop columns
+  - Handle missing values
+  - Remove non-alphanumeric characters
+  - Replace substrings in address-like fields
+- Visualize numeric data:
+  - Boxplots
+  - Histograms
+- Transform data:
+  - Label Encoding
+  - One-Hot Encoding (Pandas and Scikit-learn)
+  - Standard Scaling
+  - Anomaly removal using Z-score
+
+---
+
+## How to Run
+
+### 1. Clone the repository
+
+```bash
+git clone https://github.com/agentksimha/data-preprocessing-app.git
+cd data-preprocessing-app
+```
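+
+### 2. Install dependencies and run
+
+Assuming `requirements.txt` and `app.py` sit in the folder you launch from (in this patch, `open-source/`):
+
+```bash
+pip install -r requirements.txt
+streamlit run app.py
+```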
diff --git a/data_preprocessing_app/open-source/__init__.py b/data_preprocessing_app/open-source/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/data_preprocessing_app/open-source/app.py b/data_preprocessing_app/open-source/app.py
new file mode 100644
index 000000000..8679d9119
--- /dev/null
+++ b/data_preprocessing_app/open-source/app.py
@@ -0,0 +1,118 @@
+import streamlit as st
+import pandas as pd
+from pathlib import Path
+import matplotlib.pyplot as plt
+import sys
+import os
+
+# Ensure the module path is importable, then import the data_tool helpers
+sys.path.append(os.path.dirname(__file__))
+from data_tool import read_data, preview, preprocess_data, data_metrics_and_visualise
+
+st.set_page_config(page_title="Data Toolkit", layout="wide")
+st.title("🧠 Data Preprocessing Web App")
+
+uploaded_file = st.file_uploader("Upload a dataset", type=["csv", "xlsx", "json", "sql", "parquet"])
+
+if uploaded_file:
+    # Persist the upload to a temporary file so data_tool can read it by path
+    with open("temp_upload" + Path(uploaded_file.name).suffix, "wb") as f:
+        f.write(uploaded_file.getvalue())
+    filepath = f.name
+
+    st.success(f"Loaded file: {uploaded_file.name}")
+    task = st.selectbox("Choose Task", ["Preview", "Clean", "Visualize", "Transform"])
+
+    if "processed_df" not in st.session_state:
+        st.session_state.processed_df = read_data(filepath)
+
+    if task == "Preview":
+        head, tail = preview(filepath)
+        st.subheader("Head")
+        st.write(head)
+        st.subheader("Tail")
+        st.write(tail)
+
+        csv = st.session_state.processed_df.to_csv(index=False).encode('utf-8')
+        st.download_button("📥 Download Dataset", csv, "original_data.csv", "text/csv")
+
+    elif task == "Clean":
+        st.subheader("Cleaning Options")
+        df = st.session_state.processed_df
+        drop_cols = st.multiselect("Columns to Drop", df.columns.tolist())
+        drop_null = st.checkbox("Drop Null Values")
+        fill_zero = st.checkbox("Fill Null with Zero")
+        column_to_clean = st.selectbox("Column to Clean (remove non-alphanum)", [None] + df.columns.tolist())
+        address_col = st.selectbox("Address Column (Replace String)", [None] + df.columns.tolist())
+        old_str = st.text_input("Old String to Replace")
+        new_str = st.text_input("New String Value")
+
+        if st.button("Clean Data"):
+            # st.selectbox returns None (not the string "None") when no column
+            # is chosen, so the selections can be passed straight through
+            st.session_state.processed_df = preprocess_data(
+                st.session_state.processed_df,
+                drop_cols=drop_cols,
+                drop_null=drop_null,
+                fill_null_with_zero=fill_zero,
+                column_to_clean=column_to_clean,
+                address_col_to_standardize=address_col,
+                old_str=old_str or None,
+                new_str=new_str or None,
+            )
+            st.dataframe(st.session_state.processed_df.head())
+
+        csv = st.session_state.processed_df.to_csv(index=False).encode('utf-8')
+        st.download_button("📥 Download Cleaned Data", csv, "cleaned_data.csv", "text/csv")
+
+    elif task == "Visualize":
+        st.subheader("Visualization Options")
+        df = st.session_state.processed_df
+
+        numeric_columns = df.select_dtypes(include=["number"]).columns.tolist()
+        selected_columns = st.multiselect("Select numeric columns to visualize", numeric_columns, default=numeric_columns)
+
+        chart_type = st.selectbox("Chart Type", ["Boxplot", "Histogram"])
+
+        if st.button("Show Plots"):
+            if not selected_columns:
+                st.warning("Please select at least one numeric column to visualize.")
+            else:
+                with st.spinner("Generating plots..."):
+                    for col in selected_columns:
+                        st.write(f"### {chart_type} for `{col}`")
+
+                        fig, ax = plt.subplots()
+                        if chart_type == "Boxplot":
+                            ax.boxplot(df[col].dropna(), vert=False)
+                        elif chart_type == "Histogram":
+                            ax.hist(df[col].dropna(), bins=30)
+
+                        ax.set_title(f"{chart_type} for {col}")
+                        st.pyplot(fig)
+                        plt.clf()
+
+                csv = df[selected_columns].to_csv(index=False).encode('utf-8')
+                st.download_button("📥 Download Visualized Data", csv, "visualized_data.csv", "text/csv")
+
+    elif task == "Transform":
+        st.subheader("Transformation Options")
+        df = st.session_state.processed_df
+        label_cols = st.multiselect("Label Encode Columns", df.select_dtypes(include="object").columns.tolist())
+        one_hot_cols = st.multiselect("One-Hot Encode (Pandas)", df.select_dtypes(include="object").columns.tolist())
+        one_hot_sklearn_cols = st.multiselect("One-Hot Encode (Scikit-learn)", df.select_dtypes(include="object").columns.tolist())
+        scale = st.checkbox("Standard Scale Data")
+        anomaly_col = st.selectbox("Column for Anomaly Removal", [None] + df.select_dtypes(include="number").columns.tolist())
+
+        if st.button("Transform Data"):
+            st.session_state.processed_df = preprocess_data(
+                st.session_state.processed_df,
+                label_encode_cols=label_cols,
+                one_hot_encode_cols=one_hot_cols,
+                one_hot_encode_cols_sklearn=one_hot_sklearn_cols,
+                scale=scale,
+                anomaly_col_train=anomaly_col,
+            )
+
+            st.dataframe(st.session_state.processed_df.head())
+
+        csv = st.session_state.processed_df.to_csv(index=False).encode('utf-8')
+        st.download_button("📥 Download Transformed Data", csv, "transformed_data.csv", "text/csv")
diff --git a/data_preprocessing_app/open-source/data_tool.py b/data_preprocessing_app/open-source/data_tool.py
new file mode 100644
index 000000000..816db027b
--- /dev/null
+++ b/data_preprocessing_app/open-source/data_tool.py
@@ -0,0 +1,129 @@
+# data_tool.py
+import pandas as pd
+import numpy as np
+import re
+import sqlite3
+from pathlib import Path
+from typing import Optional, List, Tuple, Union
+from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
+
+
+# Unified preprocessing function
+def preprocess_data(
+    filepath_or_df: Union[str, pd.DataFrame],
+    drop_cols: Optional[List[str]] = None,
+    drop_null: bool = False,
+    fill_null_with_zero: bool = False,
+    column_to_clean: Optional[str] = None,
+    address_col_to_standardize: Optional[str] = None,
+    old_str: Optional[str] = None,
+    new_str: Optional[str] = None,
+    label_encode_cols: Optional[List[str]] = None,
+    one_hot_encode_cols: Optional[List[str]] = None,
+    one_hot_encode_cols_sklearn: Optional[List[str]] = None,
+    scale: bool = False,
+    axis_concat: Optional[int] = None,
+    concat_cols: Optional[Tuple[Union[str, int], Union[str, int]]] = None,
+    anomaly_col_train: Optional[str] = None,
+) -> pd.DataFrame:
+    # ✅ Accept either a filepath or a DataFrame
+    if isinstance(filepath_or_df, pd.DataFrame):
+        df = filepath_or_df.copy()
+    else:
+        df = read_data(filepath_or_df)
+
+    df.drop_duplicates(inplace=True)
+
+    if drop_cols:
+        df.drop(columns=drop_cols, inplace=True)
+    if drop_null:
+        df.dropna(inplace=True)
+    if fill_null_with_zero:
+        df.fillna(0, inplace=True)
+    if column_to_clean:
+        df[column_to_clean] = df[column_to_clean].apply(
+            lambda x: re.sub(r"[^a-zA-Z0-9]", "", str(x)) if pd.notnull(x) else x
+        )
+    if address_col_to_standardize and old_str is not None and new_str is not None:
+        df[address_col_to_standardize] = df[address_col_to_standardize].apply(
+            lambda x: str(x).replace(old_str, new_str) if pd.notnull(x) else x
+        )
+
+    if label_encode_cols:
+        le = LabelEncoder()
+        for col in label_encode_cols:
+            if col in df.columns:
+                df[col] = le.fit_transform(df[col].astype(str))
+
+    if one_hot_encode_cols_sklearn:
+        oe = OneHotEncoder(sparse_output=False)
+        for col in one_hot_encode_cols_sklearn:
+            encoded = oe.fit_transform(df[[col]])
+            df = df.drop(columns=[col])
+            df[oe.get_feature_names_out([col])] = encoded
+
+    if one_hot_encode_cols:
+        df = pd.get_dummies(df, columns=one_hot_encode_cols)
+
+    if scale:
+        # Scale only numeric columns; StandardScaler raises on non-numeric data
+        scaler = StandardScaler()
+        numeric_cols = df.select_dtypes(include=np.number).columns
+        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
+
+    if anomaly_col_train:
+        # Replace the column with its z-score, then drop rows with |z| > 3
+        scaler = StandardScaler()
+        df[anomaly_col_train] = scaler.fit_transform(df[[anomaly_col_train]])
+        df = df[(df[anomaly_col_train] <= 3) & (df[anomaly_col_train] >= -3)]
+
+    if concat_cols and axis_concat is not None:
+        df = pd.concat(
+            [df[[concat_cols[0]]], df[[concat_cols[1]]]],
+            axis=axis_concat,
+            ignore_index=True,
+        )
+
+    return df
+
+
+def read_data(filepath: str) -> pd.DataFrame:
+    suffix = Path(filepath).suffix.lower()
+    if suffix == ".csv":
+        return pd.read_csv(filepath)
+    elif suffix == ".xlsx":
+        return pd.read_excel(filepath)
+    elif suffix == ".sql":
+        # NOTE: expects a SQLite database file containing a table literally
+        # named "table_name"; adjust the query for other schemas
+        conn = sqlite3.connect(filepath)
+        df = pd.read_sql("SELECT * FROM table_name", conn)
+        conn.close()
+        return df
+    elif suffix == ".json":
+        return pd.read_json(filepath)
+    elif suffix == ".parquet":
+        return pd.read_parquet(filepath)
+    else:
+        raise ValueError(f"Unsupported file format: {suffix}")
+
+
+def preview(filepath: str):
+    df = read_data(filepath)
+    return df.head(), df.tail()
+
+
+def data_metrics_and_visualise(filepath: str, s: Optional[str] = None):
+    import matplotlib.pyplot as plt
+    df = read_data(filepath)
+    numeric_df = df.select_dtypes(include=[np.number])
+    mat = numeric_df.to_numpy().T
+
+    if s == "all_numeric_data":
+        for row in mat:
+            plt.boxplot(row)
+            plt.title("Boxplot")
+            plt.show()
+
+        for row in mat:
+            plt.hist(row, bins=30)
+            plt.title("Histogram")
+            plt.show()
+
+    df.info()
+    return df.describe(), df.head(), df.corr(numeric_only=True), df.median(numeric_only=True), df.mode()
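+
+
+# Minimal standalone sketch (not used by the Streamlit app): running
+# `python data_tool.py` directly exercises preprocess_data() on a tiny
+# made-up DataFrame; the column names below are illustrative only.
+if __name__ == "__main__":
+    demo = pd.DataFrame(
+        {
+            "city": ["Delhi!", "Mumbai", None, "Delhi!"],
+            "price": [100.0, 250.0, 175.0, 100.0],
+        }
+    )
+    cleaned = preprocess_data(
+        demo,
+        fill_null_with_zero=True,
+        column_to_clean="city",
+        label_encode_cols=["city"],
+    )
+    print(cleaned)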
diff --git a/data_preprocessing_app/open-source/requirements.txt b/data_preprocessing_app/open-source/requirements.txt
new file mode 100644
index 000000000..48fb13a0e
--- /dev/null
+++ b/data_preprocessing_app/open-source/requirements.txt
@@ -0,0 +1,6 @@
+streamlit
+pandas
+numpy
+matplotlib
+openpyxl
+scikit-learn

From 96ad737c9cdf9aa337a2e0a1a355abb8a98b9a37 Mon Sep 17 00:00:00 2001
From: Krishna Simha
Date: Sat, 19 Jul 2025 15:58:12 +0530
Subject: [PATCH 2/2] Moved data_preprocessing_app to ML Projects/

---
 .../data_preprocessing_app}/open-source/README.md          | 0
 .../data_preprocessing_app}/open-source/__init__.py        | 0
 .../data_preprocessing_app}/open-source/app.py             | 0
 .../data_preprocessing_app}/open-source/data_tool.py       | 0
 .../data_preprocessing_app}/open-source/requirements.txt   | 0
 5 files changed, 0 insertions(+), 0 deletions(-)
 rename {data_preprocessing_app => ML Projects/data_preprocessing_app}/open-source/README.md (100%)
 rename {data_preprocessing_app => ML Projects/data_preprocessing_app}/open-source/__init__.py (100%)
 rename {data_preprocessing_app => ML Projects/data_preprocessing_app}/open-source/app.py (100%)
 rename {data_preprocessing_app => ML Projects/data_preprocessing_app}/open-source/data_tool.py (100%)
 rename {data_preprocessing_app => ML Projects/data_preprocessing_app}/open-source/requirements.txt (100%)

diff --git a/data_preprocessing_app/open-source/README.md b/ML Projects/data_preprocessing_app/open-source/README.md
similarity index 100%
rename from data_preprocessing_app/open-source/README.md
rename to ML Projects/data_preprocessing_app/open-source/README.md
diff --git a/data_preprocessing_app/open-source/__init__.py b/ML Projects/data_preprocessing_app/open-source/__init__.py
similarity index 100%
rename from data_preprocessing_app/open-source/__init__.py
rename to ML Projects/data_preprocessing_app/open-source/__init__.py
diff --git a/data_preprocessing_app/open-source/app.py b/ML Projects/data_preprocessing_app/open-source/app.py
similarity index 100%
rename from data_preprocessing_app/open-source/app.py
rename to ML Projects/data_preprocessing_app/open-source/app.py
diff --git a/data_preprocessing_app/open-source/data_tool.py b/ML Projects/data_preprocessing_app/open-source/data_tool.py
similarity index 100%
rename from data_preprocessing_app/open-source/data_tool.py
rename to ML Projects/data_preprocessing_app/open-source/data_tool.py
diff --git a/data_preprocessing_app/open-source/requirements.txt b/ML Projects/data_preprocessing_app/open-source/requirements.txt
similarity index 100%
rename from data_preprocessing_app/open-source/requirements.txt
rename to ML Projects/data_preprocessing_app/open-source/requirements.txt