35 changes: 35 additions & 0 deletions ML Projects/data_preprocessing_app/open-source/README.md
@@ -0,0 +1,35 @@
# Data Preprocessing Web App

A simple and interactive Streamlit-based web app to **preview**, **clean**, **visualize**, and **transform** your datasets — no coding required!

---

## 🔧 Features

- Upload datasets in CSV, Excel, JSON, SQL, or Parquet format
- Preview dataset (head & tail)
- Clean data:
  - Drop columns
  - Handle missing values
  - Remove non-alphanumeric characters
  - Replace substrings in address-like fields
- Visualize numeric data:
  - Boxplots
  - Histograms
- Transform data:
  - Label Encoding
  - One-Hot Encoding (Pandas and Scikit-learn)
  - Standard Scaling
  - Anomaly removal using Z-score (see the sketch below)
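
The Z-score step standardizes a numeric column and keeps only rows within ±3 standard deviations. A minimal pandas sketch of the idea (the `price` column is invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({"price": [11] * 19 + [500]})  # one obvious outlier
z = (df["price"] - df["price"].mean()) / df["price"].std()
clean = df[z.abs() <= 3]  # the 500 row is dropped (its |z| is about 4.2)
```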

---

## How to Run

### 1. Clone the repository

```bash
git clone https://github.com/agentksimha/data-preprocessing-app.git
cd data-preprocessing-app
```
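
### 2. Install dependencies and run

Assuming the `requirements.txt` and `app.py` shipped in this PR, a typical way to install and launch the Streamlit app is:

```bash
pip install -r requirements.txt
streamlit run app.py
```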
118 changes: 118 additions & 0 deletions ML Projects/data_preprocessing_app/open-source/app.py
@@ -0,0 +1,118 @@
import streamlit as st
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import sys
import os

# Ensure module path and import your data_tool
sys.path.append(os.path.dirname(__file__))
from data_tool import read_data, preview, preprocess_data, data_metrics_and_visualise

st.set_page_config(page_title="Data Toolkit", layout="wide")
st.title("🧠 Data Preprocessing Web App")

uploaded_file = st.file_uploader("Upload a dataset", type=["csv", "xlsx", "json", "sql", "parquet"])

if uploaded_file:
    # Persist the upload to a temp file so read_data() can reopen it by path
    filepath = "temp_upload" + Path(uploaded_file.name).suffix
    with open(filepath, "wb") as f:
        f.write(uploaded_file.getvalue())

    st.success(f"Loaded file: {uploaded_file.name}")
    task = st.selectbox("Choose Task", ["Preview", "Clean", "Visualize", "Transform"])

    if "processed_df" not in st.session_state:
        st.session_state.processed_df = read_data(filepath)

    if task == "Preview":
        head, tail = preview(filepath)
        st.subheader("Head")
        st.write(head)
        st.subheader("Tail")
        st.write(tail)

        csv = st.session_state.processed_df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download Dataset", csv, "original_data.csv", "text/csv")

    elif task == "Clean":
        st.subheader("Cleaning Options")
        df = st.session_state.processed_df
        drop_cols = st.multiselect("Columns to Drop", df.columns.tolist())
        drop_null = st.checkbox("Drop Null Values")
        fill_zero = st.checkbox("Fill Null with Zero")
        column_to_clean = st.selectbox("Column to Clean (remove non-alphanum)", [None] + df.columns.tolist())
        address_col = st.selectbox("Address Column (Replace String)", [None] + df.columns.tolist())
        old_str = st.text_input("Old String to Replace")
        new_str = st.text_input("New String Value")

        if st.button("Clean Data"):
            st.session_state.processed_df = preprocess_data(
                st.session_state.processed_df,
                drop_cols=drop_cols,
                drop_null=drop_null,
                fill_null_with_zero=fill_zero,
                # selectbox returns the None object (not the string "None") when unset
                column_to_clean=column_to_clean,
                address_col_to_standardize=address_col,
                old_str=old_str or None,
                new_str=new_str or None,
            )
            st.dataframe(st.session_state.processed_df.head())

        csv = st.session_state.processed_df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download Cleaned Data", csv, "cleaned_data.csv", "text/csv")

    elif task == "Visualize":
        st.subheader("Visualization Options")
        df = st.session_state.processed_df

        numeric_columns = df.select_dtypes(include=["number"]).columns.tolist()
        selected_columns = st.multiselect("Select numeric columns to visualize", numeric_columns, default=numeric_columns)

        chart_type = st.selectbox("Chart Type", ["Boxplot", "Histogram"])

        if st.button("Show Plots"):
            if not selected_columns:
                st.warning("Please select at least one numeric column to visualize.")
            else:
                with st.spinner("Generating plots..."):
                    for col in selected_columns:
                        st.write(f"### {chart_type} for `{col}`")

                        fig, ax = plt.subplots()
                        if chart_type == "Boxplot":
                            ax.boxplot(df[col].dropna(), vert=False)
                        elif chart_type == "Histogram":
                            ax.hist(df[col].dropna(), bins=30)

                        ax.set_title(f"{chart_type} for {col}")
                        st.pyplot(fig)
                        plt.close(fig)  # free the figure we created, not the implicit current one

        csv = df[selected_columns].to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download Visualized Data", csv, "visualized_data.csv", "text/csv")


    elif task == "Transform":
        st.subheader("Transformation Options")
        df = st.session_state.processed_df
        label_cols = st.multiselect("Label Encode Columns", df.select_dtypes(include="object").columns.tolist())
        one_hot_cols = st.multiselect("One-Hot Encode (Pandas)", df.select_dtypes(include="object").columns.tolist())
        one_hot_sklearn_cols = st.multiselect("One-Hot Encode (Scikit-learn)", df.select_dtypes(include="object").columns.tolist())
        scale = st.checkbox("Standard Scale Data")
        anomaly_col = st.selectbox("Column for Anomaly Removal", [None] + df.select_dtypes(include="number").columns.tolist())

        if st.button("Transform Data"):
            st.session_state.processed_df = preprocess_data(
                st.session_state.processed_df,
                label_encode_cols=label_cols,
                one_hot_encode_cols=one_hot_cols,
                one_hot_encode_cols_sklearn=one_hot_sklearn_cols,
                scale=scale,
                # selectbox yields the None object when no column is chosen
                anomaly_col_train=anomaly_col,
            )

            st.dataframe(st.session_state.processed_df.head())

        csv = st.session_state.processed_df.to_csv(index=False).encode('utf-8')
        st.download_button("📥 Download Transformed Data", csv, "transformed_data.csv", "text/csv")
129 changes: 129 additions & 0 deletions ML Projects/data_preprocessing_app/open-source/data_tool.py
@@ -0,0 +1,129 @@
# data_tool.py
import pandas as pd
import numpy as np
import re
import sqlite3
from pathlib import Path
from typing import Optional, List, Tuple, Union
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Unified preprocessing function

def preprocess_data(
    filepath_or_df: Union[str, pd.DataFrame],
    drop_cols: Optional[List[str]] = None,
    drop_null: bool = False,
    fill_null_with_zero: bool = False,
    column_to_clean: Optional[str] = None,
    address_col_to_standardize: Optional[str] = None,
    old_str: Optional[str] = None,
    new_str: Optional[str] = None,
    label_encode_cols: Optional[List[str]] = None,
    one_hot_encode_cols: Optional[List[str]] = None,
    one_hot_encode_cols_sklearn: Optional[List[str]] = None,
    scale: bool = False,
    axis_concat: Optional[int] = None,
    concat_cols: Optional[Tuple[Union[str, int], Union[str, int]]] = None,
    anomaly_col_train: Optional[str] = None,
) -> pd.DataFrame:
    # ✅ Accept either a filepath or a DataFrame
    if isinstance(filepath_or_df, pd.DataFrame):
        df = filepath_or_df.copy()
    else:
        df = read_data(filepath_or_df)

    df.drop_duplicates(inplace=True)

    if drop_cols:
        df.drop(columns=drop_cols, inplace=True)
    if drop_null:
        df.dropna(inplace=True)
    if fill_null_with_zero:
        df.fillna(0, inplace=True)
    if column_to_clean:
        df[column_to_clean] = df[column_to_clean].apply(
            lambda x: re.sub(r"[^a-zA-Z0-9]", "", str(x)) if pd.notnull(x) else x
        )
    if address_col_to_standardize and old_str is not None and new_str is not None:
        df[address_col_to_standardize] = df[address_col_to_standardize].apply(
            lambda x: str(x).replace(old_str, new_str) if pd.notnull(x) else x
        )

    if label_encode_cols:
        le = LabelEncoder()
        for col in label_encode_cols:
            if col in df.columns:
                df[col] = le.fit_transform(df[col].astype(str))

    if one_hot_encode_cols_sklearn:
        # sparse_output requires scikit-learn >= 1.2
        oe = OneHotEncoder(sparse_output=False)
        for col in one_hot_encode_cols_sklearn:
            encoded = oe.fit_transform(df[[col]])
            df = df.drop(columns=[col])
            df[oe.get_feature_names_out([col])] = encoded

    if one_hot_encode_cols:
        df = pd.get_dummies(df, columns=one_hot_encode_cols)

    if scale:
        # Assumes every remaining column is numeric; StandardScaler raises otherwise
        scaler = StandardScaler()
        df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    if anomaly_col_train:
        # Standardize the column in place, then keep rows with |z-score| <= 3
        scaler = StandardScaler()
        df[anomaly_col_train] = scaler.fit_transform(df[[anomaly_col_train]])
        df = df[(df[anomaly_col_train] <= 3) & (df[anomaly_col_train] >= -3)]

    if concat_cols and axis_concat is not None:
        df = pd.concat(
            [df[[concat_cols[0]]], df[[concat_cols[1]]]],
            axis=axis_concat,
            ignore_index=True,
        )

    return df



def read_data(filepath: str) -> pd.DataFrame:
    suffix = Path(filepath).suffix.lower()
    if suffix == ".csv":
        return pd.read_csv(filepath)
    elif suffix == ".xlsx":
        return pd.read_excel(filepath)
    elif suffix == ".sql":
        # NOTE: treats the .sql file as a SQLite database and assumes it
        # contains a table literally named "table_name"
        conn = sqlite3.connect(filepath)
        df = pd.read_sql("SELECT * FROM table_name", conn)
        conn.close()
        return df
    elif suffix == ".json":
        return pd.read_json(filepath)
    elif suffix == ".parquet":
        return pd.read_parquet(filepath)
    else:
        raise ValueError(f"Unsupported file format: {suffix}")

def preview(filepath: str):
    df = read_data(filepath)
    return df.head(), df.tail()

def data_metrics_and_visualise(filepath: str, s: Optional[str] = None):
    import matplotlib.pyplot as plt
    df = read_data(filepath)
    numeric_df = df.select_dtypes(include=[np.number])
    mat = numeric_df.to_numpy().T

    if s == "all_numeric_data":
        for row in mat:
            plt.boxplot(row)
            plt.title("Boxplot")
            plt.show()

        for row in mat:
            plt.hist(row, bins=30)
            plt.title("Histogram")
            plt.show()

    df.info()
    # Restrict corr/median to numeric columns; recent pandas raises on mixed dtypes
    return df.describe(), df.head(), numeric_df.corr(), numeric_df.median(), df.mode()
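
A minimal usage sketch of `preprocess_data` (the `city` and `price` columns below are invented for illustration):

```python
# editor's sketch: column names "city" and "price" are hypothetical
import pandas as pd
from data_tool import preprocess_data

demo = pd.DataFrame({
    "city": ["Oslo", "Pune", "Oslo", None],
    "price": [10.0, 12.5, 10.0, 11.0],
})

out = preprocess_data(
    demo,
    fill_null_with_zero=True,    # the None in "city" becomes 0 before encoding
    label_encode_cols=["city"],  # categorical values -> integer codes
    scale=True,                  # both columns are numeric by this point
)
# Note: exact duplicate rows (here one "Oslo"/10.0 pair) are dropped automatically.
print(out)
```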

6 changes: 6 additions & 0 deletions ML Projects/data_preprocessing_app/open-source/requirements.txt
@@ -0,0 +1,6 @@
streamlit
pandas
numpy
matplotlib
openpyxl
scikit-learn>=1.2  # OneHotEncoder(sparse_output=...) needs 1.2+
pyarrow  # assumed addition: pandas.read_parquet requires a parquet engine