|
| 1 | +import numpy as np |
| 2 | +from sklearn.datasets import make_classification, make_circles, make_moons, make_blobs, make_gaussian_quantiles |
| 3 | + |
| 4 | + |
# Radial cut points used by the "spiral" dataset: points whose radius exceeds
# the k-th threshold (1-based) get label k. Any other class count leaves every
# point in class 0.
_SPIRAL_THRESHOLDS = {
    2: (0.5,),
    3: (0.33, 0.66),
    4: (0.25, 0.5, 0.75),
}


def _stack_classes(per_class_coords):
    """Stack per-class ``(x1, x2)`` coordinate arrays into ``X`` and labels ``y``.

    The i-th entry of *per_class_coords* is assigned the integer label ``i``.
    """
    X = np.vstack([np.column_stack(coords) for coords in per_class_coords])
    y = np.concatenate(
        [np.full(len(x1), label) for label, (x1, _) in enumerate(per_class_coords)]
    ).astype(int)
    return X, y


def _build_classification(n_samples, n_classes):
    """2-feature ``make_classification`` data with one cluster per class."""
    return make_classification(n_samples=n_samples, n_features=2, n_classes=n_classes,
                               n_clusters_per_class=1, n_redundant=0)


def _build_aniso(n_samples, n_classes):
    """Like "classification" but with ``class_sep=2``."""
    return make_classification(n_samples=n_samples, n_features=2, n_classes=n_classes,
                               n_informative=2, n_redundant=0,
                               n_clusters_per_class=1, class_sep=2)


def _build_blobs(n_samples, n_classes):
    """One ``make_blobs`` blob per class."""
    return make_blobs(n_samples=n_samples, centers=n_classes)


def _build_varied_density(n_samples, n_classes):
    """Blobs whose standard deviations are drawn at random per class."""
    cluster_std = np.random.choice([0.5, 1.0, 2.0, 0.1], n_classes)
    return make_blobs(n_samples=n_samples, centers=n_classes, cluster_std=cluster_std)


def _build_outliers_with_clusters(n_samples, n_classes):
    """Blobs where the first 20 points are shifted by +10 on both axes."""
    X, y = make_blobs(n_samples=n_samples, centers=n_classes)
    X[:20] += 10  # Add outliers
    return X, y


def _build_star_cluster(n_samples, n_classes):
    """One noisy ray per class, at evenly spaced angles around the origin."""
    per_class = n_samples // n_classes
    arms = []
    for angle in np.linspace(0, 2 * np.pi, n_classes, endpoint=False):
        # Draw order (radius, x-noise, y-noise) is kept stable so results are
        # reproducible under np.random.seed.
        r = np.random.uniform(0.5, 1.5, per_class)
        x1 = r * np.cos(angle) + np.random.normal(0, 0.2, size=r.shape)
        x2 = r * np.sin(angle) + np.random.normal(0, 0.2, size=r.shape)
        arms.append((x1, x2))
    return _stack_classes(arms)


def _build_checkerboard(n_samples, n_classes):
    """Uniform points in the unit square, labelled by 2x2 grid cell."""
    X = np.random.rand(n_samples, 2)
    if n_classes == 4:
        # Quadrant index: bit 1 is "x >= 0.5", bit 0 is "y >= 0.5".
        y = 2 * (X[:, 0] >= 0.5).astype(int) + (X[:, 1] >= 0.5).astype(int)
    else:
        y = ((np.floor(X[:, 0] * 2) + np.floor(X[:, 1] * 2)) % n_classes).astype(int)
    return X, y


def _build_concentric_rings(n_samples, n_classes):
    """One noisy circle per class, radii evenly spaced from 0.5 to 2.0."""
    per_class = n_samples // n_classes
    rings = []
    for r in np.linspace(0.5, 2.0, n_classes):
        theta = np.linspace(0, 2 * np.pi, per_class)
        x1 = r * np.cos(theta) + np.random.normal(0, 0.1, size=theta.shape)
        x2 = r * np.sin(theta) + np.random.normal(0, 0.1, size=theta.shape)
        rings.append((x1, x2))
    return _stack_classes(rings)


def _build_ball(n_samples, n_classes):
    """2-feature ``make_gaussian_quantiles`` data."""
    return make_gaussian_quantiles(n_samples=n_samples, n_features=2, n_classes=n_classes)


def _build_moons(n_samples, n_classes):
    """Two copies of a two-moons set; the copy is shifted vertically.

    NOTE(review): this returns ``2 * n_samples`` points and is deterministic
    (both calls use ``random_state=42``), unlike the other builders. Kept
    as-is for backward compatibility. *n_classes* is ignored; the result
    always has the four labels 0..3.
    """
    X_1, y_1 = make_moons(n_samples=n_samples, noise=0.1, random_state=42)
    X_2, y_2 = make_moons(n_samples=n_samples, noise=0.1, random_state=42)
    # Move the copy's label-0 points up by 1 and its label-1 points down by 1,
    # then relabel the copy as classes 2 and 3.
    X_2[:, 1] += np.where(y_2 == 0, 1, -1)
    y_2 += 2

    X = np.concatenate((X_1, X_2), axis=0)
    y = np.concatenate((y_1, y_2), axis=0)
    return X, y


def _build_sine_bands(n_samples, n_classes, frequency, spacing):
    """Noisy ``sin(frequency * pi * x)`` curves, offset vertically per class."""
    x1 = np.linspace(-1, 1, n_samples // n_classes)
    bands = []
    for i in range(n_classes):
        x2 = np.sin(frequency * np.pi * x1) + np.random.normal(0, 0.1, size=x1.shape) + spacing * i
        bands.append((x1, x2))
    return _stack_classes(bands)


def _build_wavy_clusters(n_samples, n_classes):
    """Sine bands with frequency 5 and a vertical spacing of 2 between classes."""
    return _build_sine_bands(n_samples, n_classes, frequency=5, spacing=2)


def _build_s_curves(n_samples, n_classes):
    """Sine bands with frequency 2 and a vertical spacing of 1 between classes."""
    return _build_sine_bands(n_samples, n_classes, frequency=2, spacing=1)


def _build_spiral(n_samples, n_classes):
    """A single noise-free spiral, labelled by radius band."""
    theta = np.linspace(0, 4 * np.pi, n_samples)
    r = np.linspace(0, 1, n_samples)
    X = np.column_stack([r * np.sin(theta), r * np.cos(theta)])
    y = np.zeros(n_samples, dtype=int)
    # Thresholds are ascending, so later assignments overwrite earlier ones
    # for points further out.
    for label, threshold in enumerate(_SPIRAL_THRESHOLDS.get(n_classes, ()), start=1):
        y[r > threshold] = label
    return X, y


def _build_multiple_spirals(n_samples, n_classes):
    """One noisy single-turn spiral per class, class i centred at (i, i)."""
    per_class = n_samples // n_classes
    spirals = []
    for i in range(n_classes):
        cx, cy = i * 1, i * 1  # Different starting centers
        t = np.linspace(0, 2 * np.pi, per_class)  # Spiral shape
        x = cx + t * np.cos(t) + 0.1 * np.random.randn(per_class)
        y_coord = cy + t * np.sin(t) + 0.1 * np.random.randn(per_class)
        spirals.append((x, y_coord))
    return _stack_classes(spirals)


def _build_multiarm_spiral(n_samples, n_classes):
    """One noisy three-turn spiral arm per class, arms rotated evenly about the origin."""
    per_class = n_samples // n_classes
    arms = []
    for i in range(n_classes):
        t = np.linspace(0, 3 * 2 * np.pi, per_class)  # Spiral shape
        angle_offset = (i / n_classes) * (2 * np.pi)  # Offset each spiral arm
        x = (t + 1) * np.cos(t + angle_offset) + 0.1 * np.random.randn(per_class)
        y_coord = (t + 1) * np.sin(t + angle_offset) + 0.1 * np.random.randn(per_class)
        arms.append((x, y_coord))
    return _stack_classes(arms)


# Maps each supported ``dataset_type`` name to its builder.
_DATASET_BUILDERS = {
    "classification": _build_classification,
    "aniso": _build_aniso,
    "blobs": _build_blobs,
    "varied_density": _build_varied_density,
    "outliers_with_clusters": _build_outliers_with_clusters,
    "star_cluster": _build_star_cluster,
    "checkerboard": _build_checkerboard,
    "concentric_rings": _build_concentric_rings,
    "ball": _build_ball,
    "moons": _build_moons,
    "wavy_clusters": _build_wavy_clusters,
    "s_curves": _build_s_curves,
    "spiral": _build_spiral,
    "multiple_spirals": _build_multiple_spirals,
    "multiarm_spiral": _build_multiarm_spiral,
}


def generate_classification_dataset(dataset_type, n_samples):
    """Generate a 2-D, four-class classification dataset.

    Randomness comes from NumPy's global random state (and scikit-learn's
    default), so call ``np.random.seed`` beforehand for repeatable output.

    Args:
        dataset_type: Name of the dataset to build. One of "classification",
            "aniso", "blobs", "varied_density", "outliers_with_clusters",
            "star_cluster", "checkerboard", "concentric_rings", "ball",
            "moons", "wavy_clusters", "s_curves", "spiral",
            "multiple_spirals", "multiarm_spiral".
        n_samples: Requested number of points. Datasets built class by class
            ("star_cluster", "concentric_rings", "wavy_clusters", "s_curves",
            "multiple_spirals", "multiarm_spiral") return
            ``(n_samples // 4) * 4`` points; "moons" returns
            ``2 * n_samples`` points.

    Returns:
        Tuple ``(X, y)``: ``X`` holds the datapoints, one row of two
        coordinates per point, and ``y`` holds the integer labels.

    Raises:
        ValueError: If ``dataset_type`` is not a supported name. (Previously
            this case silently returned ``None``.)
    """
    n_classes = 4

    try:
        builder = _DATASET_BUILDERS[dataset_type]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments such as lists.
        valid = ", ".join(sorted(_DATASET_BUILDERS))
        raise ValueError(
            f"Unknown dataset_type {dataset_type!r}; expected one of: {valid}"
        ) from None
    return builder(n_samples, n_classes)
0 commit comments