-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDataSetGen.py
More file actions
61 lines (47 loc) · 1.61 KB
/
DataSetGen.py
File metadata and controls
61 lines (47 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import numpy as np
import pandas as pd
def generate_cluster_data(N, K, P, random_seed=None):
    """
    Generate N 2D points clustered into K groups with variance P.

    Cluster centers are drawn uniformly from the box [-10, 10) x [-10, 10).
    Every point is its cluster center plus Gaussian noise scaled by sqrt(P),
    i.e. each coordinate has variance P around the center. Rows of the result
    are grouped by cluster, in cluster order; the first ``N % K`` clusters get
    one extra point so that the cluster sizes sum to exactly N.

    Parameters:
    - N: int, total number of points (must be >= 0)
    - K: int, number of clusters (must be >= 1)
    - P: float, variance within each cluster (must be >= 0)
    - random_seed: int or None, for reproducibility. When given, a private
      generator is seeded with it, so the global NumPy random state is NOT
      modified; the values are the same ones produced by calling
      ``np.random.seed(random_seed)`` and then drawing from ``np.random``.
      When None, the global NumPy random state is used.

    Returns:
    - data: np.ndarray of shape (N, 2), the generated 2D points

    Raises:
    - ValueError: if K < 1, N < 0 or P < 0
    """
    if K < 1:
        raise ValueError(f"K must be at least 1, got {K!r}")
    if N < 0:
        raise ValueError(f"N must be non-negative, got {N!r}")
    if P < 0:
        raise ValueError(f"P must be non-negative, got {P!r}")

    # A private RandomState reproduces the legacy seeded stream without
    # reseeding the process-wide generator behind the caller's back.
    if random_seed is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_seed)

    # Spread N over K clusters; the first `extra` clusters take one more point.
    base, extra = divmod(N, K)
    points_per_cluster = [base + 1 if i < extra else base for i in range(K)]

    # Randomly place cluster centers in a bounding box
    cluster_centers = rng.uniform(-10, 10, size=(K, 2))

    # Draw clusters one after another so the random stream is consumed in the
    # same order as centers-then-cluster-0-then-cluster-1-...
    spread = np.sqrt(P)
    data = [
        rng.randn(count, 2) * spread + center
        for count, center in zip(points_per_cluster, cluster_centers)
    ]
    return np.vstack(data)
def load_fund_data_as_numpy(file_path):
    """
    Load fund data from an Excel file and return its feature columns as floats.

    The sheet is read with ``header=1``, i.e. the second row supplies the
    column names. The "Rating" column is reduced to the first digit found in
    each cell (e.g. '5★' → 5) before the feature columns are extracted.

    Parameters:
    - file_path: anything ``pandas.read_excel`` accepts as its first argument

    Returns:
    - feature_matrix: np.ndarray of dtype float with one row per sheet row and
      one column per feature, in this order: Return(%), Volatility(%),
      ExpenseRatio, Equity(%), Debt(%), AUM(Cr), Rating, Inflows(Cr)

    Raises:
    - KeyError: if "Rating" or any other feature column is missing
    - ValueError: if a Rating cell contains no digit, or a feature column
      cannot be converted to float
    """
    # Read Excel file
    df = pd.read_excel(file_path, header=1)

    # Clean Rating (convert stars to integer, e.g. '5★' → 5).
    # Cast to str first: when Excel stores the ratings as plain numbers the
    # column is numeric and the `.str` accessor would raise AttributeError.
    rating_digits = df["Rating"].astype(str).str.extract(r"(\d)", expand=False)
    unparsable = rating_digits.isna()
    if unparsable.any():
        raise ValueError(
            f"{int(unparsable.sum())} value(s) in column 'Rating' contain no digit"
        )
    df["Rating"] = rating_digits.astype(int)

    # Explicit list of the columns that make up the feature matrix
    feature_columns = [
        "Return(%)",
        "Volatility(%)",
        "ExpenseRatio",
        "Equity(%)",
        "Debt(%)",
        "AUM(Cr)",
        "Rating",
        "Inflows(Cr)",
    ]

    # Convert to NumPy array
    return df[feature_columns].to_numpy(dtype=float)