Skip to content

Commit f4609c5

Browse files
authored
Merge pull request #96 from kananmmehta/add-py-vis-module
adding python data-vis module
2 parents 5418a2e + 9148969 commit f4609c5

File tree

6 files changed

+359
-0
lines changed

6 files changed

+359
-0
lines changed

tools/ecom-analytics/Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Base image: the slim variant of the official Python 3.11 image.
FROM python:3.11-slim
2+
# Relative paths in the instructions below resolve against /app inside the image.
WORKDIR /app
3+
# Copy only the dependency list before installing
# (presumably so the install layer can be reused from cache when only app code changes).
COPY requirements.txt .
4+
# Install the Python dependencies; --no-cache-dir disables pip's cache during the install.
RUN pip install --no-cache-dir -r requirements.txt
5+
# Copy the rest of the build context into /app.
COPY . .
6+
# Declare port 8501, the same port passed to --server.port in CMD below.
EXPOSE 8501
7+
# Launch the Streamlit app on port 8501, bound to 0.0.0.0 (all interfaces).
CMD ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
order_id,order_date,category,product,quantity,revenue,customer_type
2+
1001,2024-01-05,Electronics,Headphones,2,400,New
3+
1002,2024-01-12,Clothing,T-Shirt,3,75,Returning
4+
1003,2024-02-03,Electronics,Keyboard,1,120,Returning
5+
1004,2024-02-15,Home,Blender,1,220,New
6+
1005,2024-03-01,Clothing,Jacket,1,150,Returning
7+
1006,2024-03-18,Home,Vacuum Cleaner,1,500,New
8+
1007,2024-03-22,Electronics,Mouse,4,160,Returning
9+
1008,2024-04-05,Clothing,Shoes,2,300,New
10+
1009,2024-04-19,Electronics,Monitor,1,350,Returning
11+
1010,2024-05-10,Home,Coffee Maker,1,180,New
Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "f00ac002",
6+
"metadata": {},
7+
"source": [
8+
"📊 E-Commerce Data Analysis & Visualization\n",
9+
"Python Practice Challenge\n",
10+
"🧠 Objective\n",
11+
"\n",
12+
"This notebook helps you practice real-world Python data analysis skills by working with a small e-commerce dataset.\n",
13+
"You will:\n",
14+
"\n",
15+
"Load and explore data\n",
16+
"\n",
17+
"Analyze trends\n",
18+
"\n",
19+
"Create meaningful visualizations\n",
20+
"\n",
21+
"Draw business insights\n",
22+
"\n",
23+
"✅ Skills Practiced\n",
24+
"\n",
25+
"Pandas (data manipulation)\n",
26+
"\n",
27+
"Matplotlib & Seaborn (visualization)\n",
28+
"\n",
29+
"Exploratory Data Analysis (EDA)\n",
30+
"\n",
31+
"Thinking like a data analyst 🎯\n",
32+
"\n",
33+
"📁 Dataset\n",
34+
"\n",
35+
"We use a sample e-commerce dataset containing:\n",
36+
"\n",
37+
"Orders\n",
38+
"\n",
39+
"Dates\n",
40+
"\n",
41+
"Categories\n",
42+
"\n",
43+
"Customers\n",
44+
"\n",
45+
"Revenue\n",
46+
"\n",
47+
"🔹 Cell 1: Import Libraries\n",
48+
"import pandas as pd\n",
49+
"import matplotlib.pyplot as plt\n",
50+
"import seaborn as sns\n",
51+
"\n",
52+
"# visualization settings\n",
53+
"sns.set(style=\"whitegrid\")\n",
54+
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
55+
"\n",
56+
"🔹 Cell 2: Load Dataset\n",
57+
"\n",
58+
"If you're running this via Binder or locally, this path works.\n",
59+
"If using Colab, upload the CSV manually.\n",
60+
"\n",
61+
"df = pd.read_csv(\"../../public/data/ecommerce_sample.csv\")\n",
62+
"df.head()\n",
63+
"\n",
64+
"🔹 Cell 3: Basic Dataset Info\n",
65+
"df.info()\n",
66+
"\n",
67+
"df.describe()\n",
68+
"\n",
69+
"🔹 Cell 4: Datetime Conversion\n",
70+
"df['order_date'] = pd.to_datetime(df['order_date'])\n",
71+
"\n",
72+
"df.dtypes\n",
73+
"\n",
74+
"\n",
75+
"✅ This allows time-based analysis.\n",
76+
"\n",
77+
"🔹 Cell 5: Monthly Revenue Trend\n",
78+
"\n",
79+
"📈 Question: How does revenue change over time?\n",
80+
"\n",
81+
"monthly_revenue = (\n",
82+
" df\n",
83+
" .groupby(df['order_date'].dt.to_period('M'))['revenue']\n",
84+
" .sum()\n",
85+
")\n",
86+
"\n",
87+
"monthly_revenue.plot(\n",
88+
" kind='line',\n",
89+
" marker='o',\n",
90+
" title='Monthly Revenue Trend'\n",
91+
")\n",
92+
"\n",
93+
"plt.xlabel(\"Month\")\n",
94+
"plt.ylabel(\"Total Revenue\")\n",
95+
"plt.show()\n",
96+
"\n",
97+
"✅ Insight\n",
98+
"\n",
99+
"Identify peak sales months\n",
100+
"\n",
101+
"Observe growth or decline trends\n",
102+
"\n",
103+
"🔹 Cell 6: Revenue by Category\n",
104+
"\n",
105+
"📊 Question: Which product category generates the most revenue?\n",
106+
"\n",
107+
"category_sales = (\n",
108+
" df.groupby('category')['revenue']\n",
109+
" .sum()\n",
110+
" .sort_values(ascending=False)\n",
111+
")\n",
112+
"\n",
113+
"sns.barplot(\n",
114+
" x=category_sales.index,\n",
115+
" y=category_sales.values\n",
116+
")\n",
117+
"plt.title(\"Revenue by Product Category\")\n",
118+
"plt.xlabel(\"Category\")\n",
119+
"plt.ylabel(\"Revenue\")\n",
120+
"plt.show()\n",
121+
"\n",
122+
"✅ Insight\n",
123+
"\n",
124+
"Focus marketing on top-performing categories\n",
125+
"\n",
126+
"Spot underperforming ones\n",
127+
"\n",
128+
"🔹 Cell 7: Top Customers\n",
129+
"\n",
130+
"👥 Question: Who are the highest-value customers?\n",
131+
"\n",
132+
"customer_sales = (\n",
133+
" df.groupby('customer')['revenue']\n",
134+
" .sum()\n",
135+
" .sort_values(ascending=False)\n",
136+
")\n",
137+
"\n",
138+
"customer_sales\n",
139+
"\n",
140+
"🔹 Cell 8: Customer Revenue Share (Pie Chart)\n",
141+
"customer_sales.plot(\n",
142+
" kind='pie',\n",
143+
" autopct='%1.1f%%',\n",
144+
" title='Revenue Contribution by Customer'\n",
145+
")\n",
146+
"\n",
147+
"plt.ylabel('')\n",
148+
"plt.show()\n",
149+
"\n",
150+
"✅ Insight\n",
151+
"\n",
152+
"A few customers often drive most revenue\n",
153+
"\n",
154+
"Useful for loyalty programs\n",
155+
"\n",
156+
"🔹 Cell 9: Order Value Distribution\n",
157+
"\n",
158+
"📉 Question: What does individual order value look like?\n",
159+
"\n",
160+
"sns.histplot(df['revenue'], bins=10, kde=True)\n",
161+
"plt.title(\"Order Value Distribution\")\n",
162+
"plt.xlabel(\"Order Revenue\")\n",
163+
"plt.ylabel(\"Frequency\")\n",
164+
"plt.show()\n",
165+
"\n",
166+
"✅ Insight\n",
167+
"\n",
168+
"Helps detect low/high-value orders\n",
169+
"\n",
170+
"Useful for pricing strategy\n",
171+
"\n",
172+
"🔹 Cell 10: Key Business Insights (Markdown Cell)\n",
173+
"## 📌 Key Insights\n",
174+
"\n",
175+
"- Electronics is the highest revenue-generating category\n",
176+
"- Revenue peaks in later months, indicating growth\n",
177+
"- A small number of customers contribute a large share of revenue\n",
178+
"- Most orders cluster around mid-range prices\n",
179+
"\n",
180+
"These insights can help improve:\n",
181+
"- Inventory planning\n",
182+
"- Marketing campaigns\n",
183+
"- Customer retention strategies\n",
184+
"\n",
185+
"🎯 Challenge Tasks (For Learners)\n",
186+
"## ✅ Try This Yourself\n",
187+
"\n",
188+
"1. Find the **best-selling product**\n",
189+
"2. Calculate **average order value**\n",
190+
"3. Identify **repeat customers**\n",
191+
"4. Create a **daily sales trend**\n",
192+
"5. Add a new visualization of your choice\n",
193+
"\n",
194+
"💡 Bonus: Turn this into a Streamlit dashboard!\n",
195+
"\n",
196+
"🚀 Next Steps\n",
197+
"\n",
198+
"Extend analysis with more data\n",
199+
"\n",
200+
"Add ML models (forecasting, clustering)\n",
201+
"\n",
202+
"Build dashboards (Streamlit)"
203+
]
204+
}
205+
],
206+
"metadata": {
207+
"language_info": {
208+
"name": "python"
209+
}
210+
},
211+
"nbformat": 4,
212+
"nbformat_minor": 5
213+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
pandas
2+
matplotlib
3+
numpy
4+
scipy
5+
seaborn
6+
streamlit
7+
jupyter
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# simple CLI EDA script: tools/ecom-analytics/src/analysis.py
2+
import pandas as pd
3+
import matplotlib.pyplot as plt
4+
import os
5+
import argparse
6+
7+
def basic_eda(df, out_dir="outputs"):
    """Save summary revenue charts for an orders table as PNG files.

    Each chart is produced only when the columns it needs are present:

    * ``monthly_revenue.png`` -- ``revenue`` summed per calendar month of
      ``order_date`` (needs ``order_date`` and ``revenue``).
    * ``top_categories.png`` -- bar chart of the ten categories with the
      largest summed ``revenue`` (needs ``category`` and ``revenue``).

    Args:
        df: pandas DataFrame of orders. It is read only; the caller's
            ``order_date`` column keeps its original dtype.
        out_dir: Directory the PNG files are written to. It is created
            (including parents) when it does not exist yet.

    Returns:
        None. The charts are written to disk and a confirmation line is
        printed.

    Raises:
        Whatever ``pd.to_datetime`` raises when ``order_date`` holds values
        it cannot parse (no ``errors=`` override is passed).
    """
    os.makedirs(out_dir, exist_ok=True)
    has_revenue = 'revenue' in df.columns

    # revenue over time
    if has_revenue and 'order_date' in df.columns:
        # Parse the dates on a derived frame so the caller's DataFrame is not
        # mutated as a side effect of plotting.
        dated = df[['order_date', 'revenue']].assign(
            order_date=pd.to_datetime(df['order_date'])
        )
        # Use the MonthEnd offset object rather than a string alias: 'M' is
        # deprecated in pandas >= 2.2 and 'ME' does not exist before 2.2,
        # while the offset object yields the same month-end bins everywhere.
        monthly = dated.groupby(
            pd.Grouper(key='order_date', freq=pd.offsets.MonthEnd())
        )['revenue'].sum()
        plt.figure(figsize=(10, 4))
        try:
            monthly.plot()
            plt.title('Monthly Revenue')
            plt.xlabel('Month')
            plt.ylabel('Revenue')
            plt.tight_layout()
            plt.savefig(os.path.join(out_dir, 'monthly_revenue.png'))
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close()

    # category sales
    if has_revenue and 'category' in df.columns:
        cat = (
            df.groupby('category')['revenue']
            .sum()
            .sort_values(ascending=False)
            .head(10)
        )
        plt.figure(figsize=(8, 5))
        try:
            cat.plot(kind='bar')
            plt.title('Top Categories by Revenue')
            plt.tight_layout()
            plt.savefig(os.path.join(out_dir, 'top_categories.png'))
        finally:
            plt.close()

    # more charts...
    print("Saved charts to", out_dir)
34+
35+
def main():
    """Command-line entry point: read a CSV and write the EDA charts.

    Options:
        --csv  Path to the dataset CSV (required).
        --out  Folder the plots are written to (defaults to ``outputs``).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--csv", required=True, help="Path to dataset CSV")
    cli.add_argument("--out", default="outputs", help="Output folder for plots")
    options = cli.parse_args()

    # Load the dataset and hand it straight to the chart generator.
    basic_eda(pd.read_csv(options.csv), options.out)


if __name__ == "__main__":
    main()
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import streamlit as st
2+
import pandas as pd
3+
import matplotlib.pyplot as plt
4+
import os
5+
6+
# Streamlit practice dashboard: load an orders CSV (uploaded or bundled sample),
# validate its columns, and show optional revenue charts plus an insights form.
# Streamlit re-executes this whole script on every widget interaction.
st.set_page_config("E-commerce Analytics Practice", layout="wide")
st.title("📊 E-commerce Analytics — Practice Challenge")

# Resolve the sample dataset relative to this file, not the working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "../data/ecommerce_sample.csv")

# ---------- Instructions ----------
with st.expander("📘 Challenge Instructions", expanded=True):
    st.markdown("""
    ### Tasks
    1. Explore the dataset columns
    2. Convert `order_date` to datetime
    3. Find monthly sales trends
    4. Identify top categories by revenue
    5. Visualize your insights

    ✅ Use the checkboxes to display charts
    ✅ Upload your own dataset to experiment
    """)

# ---------- Load Data ----------
uploaded = st.file_uploader("Upload CSV (optional)", type=["csv"])

# st.button() is True only during the single rerun triggered by the click.
# Any later interaction (e.g. ticking a chart checkbox) reruns the script with
# the button False again, so the choice is remembered in session state;
# otherwise the sample dataset would vanish as soon as a chart is requested.
if "use_sample" not in st.session_state:
    st.session_state["use_sample"] = False
if st.button("Use Sample Dataset"):
    st.session_state["use_sample"] = True
use_sample = st.session_state["use_sample"]

# An uploaded file takes precedence over the sample dataset.
if uploaded is not None:
    df = pd.read_csv(uploaded)
elif use_sample:
    df = pd.read_csv(DATA_PATH)
else:
    st.info("Upload a CSV or use the sample dataset.")
    st.stop()

# Lower-case the headers so validation below is case-insensitive.
df.columns = df.columns.str.lower()

st.subheader("📄 Dataset Preview")
st.dataframe(df.head())

# ---------- Validation ----------
required_cols = {"order_date", "revenue", "category"}
missing = required_cols - set(df.columns)

if missing:
    st.error(f"Missing required columns: {', '.join(missing)}")
    st.stop()

# Unparseable dates become NaT instead of aborting the app.
df["order_date"] = pd.to_datetime(df["order_date"], errors="coerce")

# ---------- Visualizations ----------
st.subheader("📈 Analysis & Visualization")

if st.checkbox("✅ Monthly Revenue Trend"):
    # Use the MonthEnd offset object rather than a string alias: 'M' is
    # deprecated in pandas >= 2.2 and 'ME' does not exist before 2.2, while
    # the offset object yields the same month-end bins on every version.
    monthly = df.groupby(
        pd.Grouper(key="order_date", freq=pd.offsets.MonthEnd())
    )["revenue"].sum()
    st.line_chart(monthly)

if st.checkbox("✅ Top Categories by Revenue"):
    fig, ax = plt.subplots()
    df.groupby("category")["revenue"].sum().sort_values(ascending=False).plot(
        kind="bar", ax=ax
    )
    ax.set_ylabel("Revenue")
    st.pyplot(fig)
    # The script reruns on every interaction; close the figure so pyplot does
    # not accumulate one open figure per rerun.
    plt.close(fig)

# ---------- Insights ----------
st.subheader("🧠 Write Your Insights")
insight = st.text_area("What did you observe from the data?")

if st.button("Submit Challenge"):
    if insight.strip():
        st.success("✅ Challenge completed! Great work.")
    else:
        st.warning("Please write at least one insight before submitting.")

0 commit comments

Comments
 (0)