Merge pull request #96 from kananmmehta/add-py-vis-module

namish18 · web-flow · commit f4609c59157f · 2025-12-06T19:55:13.000+05:30
adding python data-vis module
diff --git a/tools/ecom-analytics/Dockerfile b/tools/ecom-analytics/Dockerfile
@@ -0,0 +1,7 @@
+# Container image for the Streamlit practice app.
+FROM python:3.11-slim
+WORKDIR /app
+# Copy and install the requirements before the rest of the sources so the
+# dependency layer can be reused when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Same port that the CMD below passes to Streamlit via --server.port.
+EXPOSE 8501
+# --server.address=0.0.0.0 makes the server listen on all interfaces so it
+# is reachable from outside the container.
+CMD ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
diff --git a/tools/ecom-analytics/data/ecommerce_sample.csv b/tools/ecom-analytics/data/ecommerce_sample.csv
@@ -0,0 +1,11 @@
+order_id,order_date,category,product,quantity,revenue,customer_type
+1001,2024-01-05,Electronics,Headphones,2,400,New
+1002,2024-01-12,Clothing,T-Shirt,3,75,Returning
+1003,2024-02-03,Electronics,Keyboard,1,120,Returning
+1004,2024-02-15,Home,Blender,1,220,New
+1005,2024-03-01,Clothing,Jacket,1,150,Returning
+1006,2024-03-18,Home,Vacuum Cleaner,1,500,New
+1007,2024-03-22,Electronics,Mouse,4,160,Returning
+1008,2024-04-05,Clothing,Shoes,2,300,New
+1009,2024-04-19,Electronics,Monitor,1,350,Returning
+1010,2024-05-10,Home,Coffee Maker,1,180,New
diff --git a/tools/ecom-analytics/notebooks/analysis.ipynb b/tools/ecom-analytics/notebooks/analysis.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f00ac002",
+   "metadata": {},
+   "source": [
+    "📊 E-Commerce Data Analysis & Visualization\n",
+    "Python Practice Challenge\n",
+    "🧠 Objective\n",
+    "\n",
+    "This notebook helps you practice real-world Python data analysis skills by working with a small e-commerce dataset.\n",
+    "You will:\n",
+    "\n",
+    "Load and explore data\n",
+    "\n",
+    "Analyze trends\n",
+    "\n",
+    "Create meaningful visualizations\n",
+    "\n",
+    "Draw business insights\n",
+    "\n",
+    "✅ Skills Practiced\n",
+    "\n",
+    "Pandas (data manipulation)\n",
+    "\n",
+    "Matplotlib & Seaborn (visualization)\n",
+    "\n",
+    "Exploratory Data Analysis (EDA)\n",
+    "\n",
+    "Thinking like a data analyst 🎯\n",
+    "\n",
+    "📁 Dataset\n",
+    "\n",
+    "We use a sample e-commerce dataset containing:\n",
+    "\n",
+    "Orders\n",
+    "\n",
+    "Dates\n",
+    "\n",
+    "Categories\n",
+    "\n",
+    "Customers\n",
+    "\n",
+    "Revenue\n",
+    "\n",
+    "🔹 Cell 1: Import Libraries\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "# visualization settings\n",
+    "sns.set(style=\"whitegrid\")\n",
+    "plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
+    "\n",
+    "🔹 Cell 2: Load Dataset\n",
+    "\n",
+    "If you're running this via Binder or locally, this path works.\n",
+    "If using Colab, upload the CSV manually.\n",
+    "\n",
+    "df = pd.read_csv(\"../data/ecommerce_sample.csv\")\n",
+    "df.head()\n",
+    "\n",
+    "🔹 Cell 3: Basic Dataset Info\n",
+    "df.info()\n",
+    "\n",
+    "df.describe()\n",
+    "\n",
+    "🔹 Cell 4: Datetime Conversion\n",
+    "df['order_date'] = pd.to_datetime(df['order_date'])\n",
+    "\n",
+    "df.dtypes\n",
+    "\n",
+    "\n",
+    "✅ This allows time-based analysis.\n",
+    "\n",
+    "🔹 Cell 5: Monthly Revenue Trend\n",
+    "\n",
+    "📈 Question: How does revenue change over time?\n",
+    "\n",
+    "monthly_revenue = (\n",
+    "    df\n",
+    "    .groupby(df['order_date'].dt.to_period('M'))['revenue']\n",
+    "    .sum()\n",
+    ")\n",
+    "\n",
+    "monthly_revenue.plot(\n",
+    "    kind='line',\n",
+    "    marker='o',\n",
+    "    title='Monthly Revenue Trend'\n",
+    ")\n",
+    "\n",
+    "plt.xlabel(\"Month\")\n",
+    "plt.ylabel(\"Total Revenue\")\n",
+    "plt.show()\n",
+    "\n",
+    "✅ Insight\n",
+    "\n",
+    "Identify peak sales months\n",
+    "\n",
+    "Observe growth or decline trends\n",
+    "\n",
+    "🔹 Cell 6: Revenue by Category\n",
+    "\n",
+    "📊 Question: Which product category generates the most revenue?\n",
+    "\n",
+    "category_sales = (\n",
+    "    df.groupby('category')['revenue']\n",
+    "    .sum()\n",
+    "    .sort_values(ascending=False)\n",
+    ")\n",
+    "\n",
+    "sns.barplot(\n",
+    "    x=category_sales.index,\n",
+    "    y=category_sales.values\n",
+    ")\n",
+    "plt.title(\"Revenue by Product Category\")\n",
+    "plt.xlabel(\"Category\")\n",
+    "plt.ylabel(\"Revenue\")\n",
+    "plt.show()\n",
+    "\n",
+    "✅ Insight\n",
+    "\n",
+    "Focus marketing on top-performing categories\n",
+    "\n",
+    "Spot underperforming ones\n",
+    "\n",
+    "🔹 Cell 7: Top Customers\n",
+    "\n",
+    "👥 Question: Who are the highest-value customers?\n",
+    "\n",
+    "customer_sales = (\n",
+    "    df.groupby('customer_type')['revenue']\n",
+    "    .sum()\n",
+    "    .sort_values(ascending=False)\n",
+    ")\n",
+    "\n",
+    "customer_sales\n",
+    "\n",
+    "🔹 Cell 8: Customer Revenue Share (Pie Chart)\n",
+    "customer_sales.plot(\n",
+    "    kind='pie',\n",
+    "    autopct='%1.1f%%',\n",
+    "    title='Revenue Contribution by Customer'\n",
+    ")\n",
+    "\n",
+    "plt.ylabel('')\n",
+    "plt.show()\n",
+    "\n",
+    "✅ Insight\n",
+    "\n",
+    "A few customers often drive most revenue\n",
+    "\n",
+    "Useful for loyalty programs\n",
+    "\n",
+    "🔹 Cell 9: Order Value Distribution\n",
+    "\n",
+    "📉 Question: What does individual order value look like?\n",
+    "\n",
+    "sns.histplot(df['revenue'], bins=10, kde=True)\n",
+    "plt.title(\"Order Value Distribution\")\n",
+    "plt.xlabel(\"Order Revenue\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.show()\n",
+    "\n",
+    "✅ Insight\n",
+    "\n",
+    "Helps detect low/high-value orders\n",
+    "\n",
+    "Useful for pricing strategy\n",
+    "\n",
+    "🔹 Cell 10: Key Business Insights (Markdown Cell)\n",
+    "## 📌 Key Insights\n",
+    "\n",
+    "- Electronics is the highest revenue-generating category\n",
+    "- Revenue peaks in March and declines afterwards in this sample\n",
+    "- New customers contribute about 65% of revenue (1,600 of 2,455)\n",
+    "- Most orders cluster around mid-range prices\n",
+    "\n",
+    "These insights can help improve:\n",
+    "- Inventory planning\n",
+    "- Marketing campaigns\n",
+    "- Customer retention strategies\n",
+    "\n",
+    "🎯 Challenge Tasks (For Learners)\n",
+    "## ✅ Try This Yourself\n",
+    "\n",
+    "1. Find the **best-selling product**\n",
+    "2. Calculate **average order value**\n",
+    "3. Identify **repeat customers**\n",
+    "4. Create a **daily sales trend**\n",
+    "5. Add a new visualization of your choice\n",
+    "\n",
+    "💡 Bonus: Turn this into a Streamlit dashboard!\n",
+    "\n",
+    "🚀 Next Steps\n",
+    "\n",
+    "Extend analysis with more data\n",
+    "\n",
+    "Add ML models (forecasting, clustering)\n",
+    "\n",
+    "Build dashboards (Streamlit)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tools/ecom-analytics/requirements.txt b/tools/ecom-analytics/requirements.txt
@@ -0,0 +1,7 @@
+pandas
+matplotlib
+numpy
+scipy
+seaborn
+streamlit
+jupyter
diff --git a/tools/ecom-analytics/src/analysis.py b/tools/ecom-analytics/src/analysis.py
@@ -0,0 +1,45 @@
+# simple CLI EDA script: tools/ecom-analytics/src/analysis.py
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+import argparse
+
+def basic_eda(df, out_dir="outputs"):
+    """Render the basic EDA charts for an orders DataFrame as PNG files.
+
+    A chart is only produced when the columns it needs are present:
+
+    * ``monthly_revenue.png`` -- needs ``order_date`` and ``revenue``
+    * ``top_categories.png`` -- needs ``category`` and ``revenue``
+
+    Args:
+        df: Orders data. The frame is not modified; date parsing is done on
+            a derived frame.
+        out_dir: Folder the PNG files are written to (created if missing).
+    """
+    os.makedirs(out_dir, exist_ok=True)
+    has_revenue = 'revenue' in df.columns
+
+    # revenue over time
+    if has_revenue and 'order_date' in df.columns:
+        # assign() returns a new frame, so the caller's order_date column
+        # keeps whatever dtype it had.
+        dated = df.assign(order_date=pd.to_datetime(df['order_date']))
+        # 'MS' (month start) buckets by calendar month exactly like the old
+        # 'M' alias did, but 'M' is deprecated for offsets since pandas 2.2
+        # and its replacement 'ME' is unknown to older releases. 'MS' is
+        # accepted by both, which matters because requirements.txt does not
+        # pin a pandas version.
+        monthly = dated.groupby(pd.Grouper(key='order_date', freq='MS'))['revenue'].sum()
+        fig, ax = plt.subplots(figsize=(10, 4))
+        monthly.plot(ax=ax)
+        ax.set_title('Monthly Revenue')
+        ax.set_xlabel('Month')
+        ax.set_ylabel('Revenue')
+        fig.tight_layout()
+        fig.savefig(os.path.join(out_dir, 'monthly_revenue.png'))
+        plt.close(fig)
+
+    # category sales
+    if has_revenue and 'category' in df.columns:
+        cat = df.groupby('category')['revenue'].sum().sort_values(ascending=False).head(10)
+        fig, ax = plt.subplots(figsize=(8, 5))
+        cat.plot(kind='bar', ax=ax)
+        ax.set_title('Top Categories by Revenue')
+        fig.tight_layout()
+        fig.savefig(os.path.join(out_dir, 'top_categories.png'))
+        plt.close(fig)
+
+    # more charts...
+    print("Saved charts to", out_dir)
+
+def main():
+    """Command-line entry point: load the CSV given by ``--csv`` and chart it.
+
+    ``--out`` selects the folder for the generated plots and defaults to
+    ``outputs``.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--csv', required=True, help='Path to dataset CSV')
+    cli.add_argument('--out', default='outputs', help='Output folder for plots')
+    options = cli.parse_args()
+
+    orders = pd.read_csv(options.csv)
+    basic_eda(orders, out_dir=options.out)
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/ecom-analytics/src/streamlit_app.py b/tools/ecom-analytics/src/streamlit_app.py
@@ -0,0 +1,76 @@
+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+
+st.set_page_config("E-commerce Analytics Practice", layout="wide")
+st.title("📊 E-commerce Analytics — Practice Challenge")
+
+# Resolve the bundled sample CSV relative to this file rather than the
+# current working directory.
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+DATA_PATH = os.path.join(BASE_DIR, "../data/ecommerce_sample.csv")
+
+# ---------- Instructions ----------
+with st.expander("📘 Challenge Instructions", expanded=True):
+    st.markdown("""
+    ### Tasks
+    1. Explore the dataset columns
+    2. Convert `order_date` to datetime
+    3. Find monthly sales trends
+    4. Identify top categories by revenue
+    5. Visualize your insights
+
+    ✅ Use the checkboxes to display charts  
+    ✅ Upload your own dataset to experiment
+    """)
+
+# ---------- Load Data ----------
+uploaded = st.file_uploader("Upload CSV (optional)", type=["csv"])
+
+# st.button() is True only during the rerun triggered by the click. Any later
+# widget interaction (e.g. ticking a chart checkbox) reruns the script with
+# the button back to False, which would drop the sample data and end at
+# st.stop() below. Remember the choice in session state so it survives reruns.
+if st.button("Use Sample Dataset"):
+    st.session_state["use_sample"] = True
+use_sample = st.session_state.get("use_sample", False)
+
+# An uploaded file always takes precedence over the sample dataset.
+if uploaded:
+    df = pd.read_csv(uploaded)
+elif use_sample:
+    df = pd.read_csv(DATA_PATH)
+else:
+    st.info("Upload a CSV or use the sample dataset.")
+    st.stop()
+
+# Column matching below is case-insensitive.
+df.columns = df.columns.str.lower()
+
+st.subheader("📄 Dataset Preview")
+st.dataframe(df.head())
+
+# ---------- Validation ----------
+required_cols = {"order_date", "revenue", "category"}
+missing = required_cols - set(df.columns)
+
+if missing:
+    # sorted() keeps the message stable; set iteration order is arbitrary.
+    st.error(f"Missing required columns: {', '.join(sorted(missing))}")
+    st.stop()
+
+# Unparseable dates become NaT instead of raising; tell the user how many
+# rows that affects rather than dropping them from the trend silently.
+df["order_date"] = pd.to_datetime(df["order_date"], errors="coerce")
+undated = int(df["order_date"].isna().sum())
+if undated:
+    st.warning(
+        f"{undated} row(s) have a missing or unparseable order_date "
+        "and are left out of the monthly trend."
+    )
+
+# ---------- Visualizations ----------
+st.subheader("📈 Analysis & Visualization")
+
+if st.checkbox("✅ Monthly Revenue Trend"):
+    # 'MS' (month start) groups by calendar month like the old 'M' alias,
+    # which is deprecated for offsets since pandas 2.2; its replacement 'ME'
+    # is unknown to older releases, while 'MS' is accepted by both.
+    monthly = df.groupby(pd.Grouper(key="order_date", freq="MS"))["revenue"].sum()
+    st.line_chart(monthly)
+
+if st.checkbox("✅ Top Categories by Revenue"):
+    fig, ax = plt.subplots()
+    df.groupby("category")["revenue"].sum().sort_values(ascending=False).plot(
+        kind="bar", ax=ax
+    )
+    ax.set_ylabel("Revenue")
+    st.pyplot(fig)
+    # The script reruns on every interaction; close the figure so pyplot
+    # does not keep one more open figure per rerun.
+    plt.close(fig)
+
+# ---------- Insights ----------
+st.subheader("🧠 Write Your Insights")
+insight = st.text_area("What did you observe from the data?")
+
+if st.button("Submit Challenge"):
+    if insight.strip():
+        st.success("✅ Challenge completed! Great work.")
+    else:
+        st.warning("Please write at least one insight before submitting.")

-Original file line number
+Diff line change
@@ @@ -0,0 +1,7 @@ @@
 +pandas
 +matplotlib
 +numpy
 +scipy
 +seaborn
 +streamlit
 +jupyter