Commit f883e3a

Khauneesh-AI authored and Keivan Vosoughi committed
added analyser files required for previous commit
1 parent 915f553 commit f883e3a

File tree

3 files changed: +568 −0 lines changed


app/core/data_analyser.py

Lines changed: 357 additions & 0 deletions
@@ -0,0 +1,357 @@
import pandas as pd
import numpy as np
import warnings
from typing import Dict, List, Any, Union, Optional, Tuple
import math


class DataAnalyser:
    """Utility class for analyzing datasets and providing statistical insights."""

    @classmethod
    def analyse(cls, df: pd.DataFrame, correlation_threshold: float = 0.7) -> Dict[str, Any]:
        """
        Analyze a DataFrame and extract useful statistics and insights.

        Args:
            df: Input DataFrame to analyze
            correlation_threshold: Threshold for identifying strong correlations

        Returns:
            Dictionary containing analysis results
        """
        print("Analyzing data...")

        # Initialize results structure
        results = {
            "columns": [],
            "grp_columns": {},
            "statistical_analysis": {},
            "cross_row_relationship": {},
            "cross_column_relationship": {}
        }

        # Categorize columns
        results["grp_columns"] = cls.categorize_columns(df)
        results["columns"] = df.columns.tolist()

        # Analyze each type of column
        stats = {}
        if results["grp_columns"]["numeric"]:
            stats["numeric"] = cls.analyze_numeric_columns(df, results["grp_columns"]["numeric"])

        if results["grp_columns"]["categorical"]:
            stats["categorical"] = cls.analyze_categorical_columns(df, results["grp_columns"]["categorical"])

        if results["grp_columns"]["datetime"]:
            stats["datetime"] = cls.analyze_datetime_columns(df, results["grp_columns"]["datetime"])

        results["statistical_analysis"] = stats

        # Analyze cross-row relationships
        results["cross_row_relationship"] = cls.analyze_cross_row_relationships(df)

        # Analyze cross-column relationships
        if results["grp_columns"]["numeric"] and len(results["grp_columns"]["numeric"]) > 1:
            results["cross_column_relationship"] = cls.analyze_cross_column_relationships(
                df, results["grp_columns"]["numeric"], correlation_threshold
            )

        return results

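    # Shape of the returned dictionary (illustrative sketch; actual values depend on the input):
    #
    #   DataAnalyser.analyse(df) -> {
    #       "columns": [...],
    #       "grp_columns": {"numeric": [...], "categorical": [...], "datetime": [...],
    #                       "text": [...], "other": [...]},
    #       "statistical_analysis": {"numeric": {...}, "categorical": {...}, "datetime": {...}},
    #       "cross_row_relationship": {"duplicates": {...}, "null_rows": {...}},
    #       "cross_column_relationship": {"correlations": {...}},  # only when strong correlations exist
    #   }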
    @classmethod
    def categorize_columns(cls, df: pd.DataFrame) -> Dict[str, List[str]]:
        """
        Categorize DataFrame columns by their data types.

        Args:
            df: Input DataFrame

        Returns:
            Dictionary mapping column types to lists of column names
        """
        result = {
            "numeric": [],
            "categorical": [],
            "datetime": [],
            "text": [],
            "other": []
        }

        for col in df.columns:
            column = df[col]

            # Check if already datetime type - most reliable method
            if pd.api.types.is_datetime64_any_dtype(column):
                result["datetime"].append(col)

            # Check numeric types
            elif pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
                result["numeric"].append(col)

            # Check categorical and boolean
            elif pd.api.types.is_categorical_dtype(column) or pd.api.types.is_bool_dtype(column):
                result["categorical"].append(col)

            # Check for text columns
            elif pd.api.types.is_string_dtype(column) or pd.api.types.is_object_dtype(column):
                # Treat low-cardinality string columns as categorical (few unique values)
                non_null_count = column.count()
                if non_null_count > 0:
                    unique_ratio = column.nunique() / non_null_count
                    if unique_ratio < 0.2:  # If less than 20% of values are unique, consider categorical
                        result["categorical"].append(col)
                    else:
                        result["text"].append(col)
                else:
                    result["text"].append(col)

            # Everything else
            else:
                result["other"].append(col)

        # Verify all columns are categorized
        categorized = []
        for category, cols in result.items():
            categorized.extend(cols)

        missing = set(df.columns) - set(categorized)
        if missing:
            print(f"Found uncategorized columns: {missing}")
            result["other"].extend(list(missing))

        return result

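    # Illustration of the 20%-unique heuristic above (hypothetical columns): an object
    # column holding values like ["red", "blue", "red", "green", "red", ...] with only a
    # few distinct values (unique_ratio < 0.2) lands in "categorical", while a free-text
    # column of comments, where almost every value is unique, lands in "text".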
    @classmethod
    def analyze_numeric_columns(cls, df: pd.DataFrame, numeric_columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze numeric columns to extract statistical information.

        Args:
            df: Input DataFrame
            numeric_columns: List of numeric column names

        Returns:
            Dictionary mapping column names to their statistics
        """
        result = {}

        for col in numeric_columns:
            # Skip columns with all NaN values
            if df[col].isna().all():
                continue

            stats = {}

            # Basic statistics
            stats["count"] = int(df[col].count())
            stats["mean"] = float(df[col].mean())
            stats["median"] = float(df[col].median())
            stats["std"] = float(df[col].std())
            stats["min"] = float(df[col].min())
            stats["max"] = float(df[col].max())

            # Calculate percentiles
            for p in [25, 75, 90, 95, 99]:
                stats[f"p{p}"] = float(df[col].quantile(p / 100))

            # Null value statistics
            null_count = int(df[col].isna().sum())
            stats["null_count"] = null_count
            stats["null_percentage"] = float((null_count / len(df)) * 100)

            result[col] = stats

        return result

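    # Example per-column output (hypothetical values):
    #
    #   {"count": 1000, "mean": 42.1, "median": 40.0, "std": 9.3, "min": 18.0, "max": 87.0,
    #    "p25": 35.0, "p75": 48.0, "p90": 55.0, "p95": 60.0, "p99": 71.0,
    #    "null_count": 12, "null_percentage": 1.19}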
    @classmethod
    def analyze_categorical_columns(cls, df: pd.DataFrame, categorical_columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze categorical columns to extract distribution information.

        Args:
            df: Input DataFrame
            categorical_columns: List of categorical column names

        Returns:
            Dictionary mapping column names to their statistics
        """
        result = {}

        for col in categorical_columns:
            # Skip columns with all NaN values
            if df[col].isna().all():
                continue

            stats = {}

            # Basic statistics
            stats["count"] = int(df[col].count())
            stats["unique_count"] = int(df[col].nunique())

            # Value distribution (top 10 most common values)
            value_counts = df[col].value_counts().head(10).to_dict()
            # Convert any non-string keys to strings for JSON compatibility
            top_values = {}
            for k, v in value_counts.items():
                key = str(k) if not isinstance(k, str) else k
                top_values[key] = int(v)

            stats["top_values"] = top_values

            # Calculate entropy to measure randomness
            counts = df[col].value_counts()
            probs = counts / counts.sum()
            entropy = -np.sum(probs * np.log2(probs))
            stats["entropy"] = float(entropy)

            # Null value statistics
            null_count = int(df[col].isna().sum())
            stats["null_count"] = null_count
            stats["null_percentage"] = float((null_count / len(df)) * 100)

            result[col] = stats

        return result

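    # The entropy above is the Shannon entropy of the value distribution in bits:
    # four equally frequent categories give log2(4) = 2.0, while a column dominated
    # by a single value has entropy close to 0.0.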
    @classmethod
    def analyze_datetime_columns(cls, df: pd.DataFrame, datetime_columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Analyze datetime columns to extract temporal patterns.

        Args:
            df: Input DataFrame
            datetime_columns: List of datetime column names

        Returns:
            Dictionary mapping column names to their statistics
        """
        result = {}

        for col in datetime_columns:
            # Skip columns with all NaN values
            if df[col].isna().all():
                continue

            stats = {}

            # Basic statistics
            stats["count"] = int(df[col].count())
            stats["min"] = str(df[col].min())
            stats["max"] = str(df[col].max())

            # Calculate temporal span
            min_date = df[col].min()
            max_date = df[col].max()
            if pd.notna(min_date) and pd.notna(max_date):
                span_days = (max_date - min_date).total_seconds() / (60 * 60 * 24)
                stats["span_days"] = float(span_days)

            # Extract date parts distribution
            date_parts = {}

            # Year distribution
            if df[col].dt.year.nunique() > 1:
                year_counts = df[col].dt.year.value_counts().to_dict()
                date_parts["year"] = {str(k): int(v) for k, v in year_counts.items()}

            # Month distribution
            month_counts = df[col].dt.month.value_counts().to_dict()
            date_parts["month"] = {str(k): int(v) for k, v in month_counts.items()}

            # Day of week distribution
            dow_counts = df[col].dt.dayofweek.value_counts().to_dict()
            date_parts["day_of_week"] = {str(k): int(v) for k, v in dow_counts.items()}

            # Hour distribution (if time component exists)
            if (df[col].dt.hour != 0).any():
                hour_counts = df[col].dt.hour.value_counts().to_dict()
                date_parts["hour"] = {str(k): int(v) for k, v in hour_counts.items()}

            stats["date_parts"] = date_parts

            # Null value statistics
            null_count = int(df[col].isna().sum())
            stats["null_count"] = null_count
            stats["null_percentage"] = float((null_count / len(df)) * 100)

            result[col] = stats

        return result

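    # Note: date_parts keys are raw integers cast to strings, e.g. "month" runs "1"-"12"
    # and "day_of_week" runs "0" (Monday) to "6" (Sunday), per pandas' dt.dayofweek.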
    @classmethod
    def analyze_cross_row_relationships(cls, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Analyze relationships across rows, such as duplicates and null patterns.

        Args:
            df: Input DataFrame

        Returns:
            Dictionary containing cross-row relationship information
        """
        result = {}

        # Analyze duplicates
        duplicates = df.duplicated()
        duplicate_count = int(duplicates.sum())
        duplicate_percentage = float((duplicate_count / len(df)) * 100)

        result["duplicates"] = {
            "count": duplicate_count,
            "percentage": duplicate_percentage
        }

        # Analyze rows with null values
        rows_with_null = df.isna().any(axis=1)
        null_rows_count = int(rows_with_null.sum())
        null_rows_percentage = float((null_rows_count / len(df)) * 100)

        result["null_rows"] = {
            "count": null_rows_count,
            "percentage": null_rows_percentage
        }

        return result

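    # Both metrics report an absolute count plus a percentage of all rows, e.g. 50
    # duplicate rows in a 1,000-row DataFrame yields {"count": 50, "percentage": 5.0}.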
    @classmethod
    def analyze_cross_column_relationships(
        cls, df: pd.DataFrame, numeric_columns: List[str], correlation_threshold: float
    ) -> Dict[str, Any]:
        """
        Analyze relationships between columns, such as correlations.

        Args:
            df: Input DataFrame
            numeric_columns: List of numeric column names
            correlation_threshold: Threshold for identifying strong correlations

        Returns:
            Dictionary containing cross-column relationship information
        """
        result = {}

        # Calculate correlations between numeric columns
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            corr_matrix = df[numeric_columns].corr()

        # Extract strong correlations (ignore self-correlations)
        strong_correlations = {}
        for i in range(len(numeric_columns)):
            for j in range(i + 1, len(numeric_columns)):
                col1 = numeric_columns[i]
                col2 = numeric_columns[j]
                corr_value = corr_matrix.iloc[i, j]

                # Skip NaN correlations
                if pd.isna(corr_value):
                    continue

                # Keep correlations whose absolute value is at or above the threshold
                if abs(corr_value) >= correlation_threshold:
                    pair_name = f"{col1} - {col2}"
                    strong_correlations[pair_name] = float(corr_value)

        if strong_correlations:
            result["correlations"] = strong_correlations

        return result
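
A minimal usage sketch for the new analyser (hypothetical data; assumes the repository root is on PYTHONPATH so that app/core/data_analyser.py is importable as a module):

import pandas as pd

from app.core.data_analyser import DataAnalyser

# Small illustrative dataset: two numeric columns, one text column, one datetime column
df = pd.DataFrame({
    "age": [25, 32, 47, 51, 62, None],
    "income": [40000, 52000, 88000, 91000, 120000, 75000],
    "city": ["NY", "LA", "NY", "NY", "LA", "NY"],
    "signup": pd.to_datetime(["2021-01-05", "2021-02-11", "2021-02-11",
                              "2021-03-20", "2021-04-02", "2021-04-02"]),
})

report = DataAnalyser.analyse(df, correlation_threshold=0.7)

print(report["grp_columns"]["numeric"])                              # ['age', 'income']
print(report["statistical_analysis"]["numeric"]["income"]["mean"])   # average income
print(report["cross_row_relationship"]["null_rows"])                 # one row has a missing age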
