Skip to content

Commit 092ef05

Browse files
FIX: change dataset loading source from OpenML to Kaggle as OpenML is down as of 14/01/25 (#598)
1 parent 7736558 commit 092ef05

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

examples/regression/1-quickstart/plot_compare_conformity_scores.py

Lines changed: 24 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,8 @@
99
We use here the OpenML house_prices dataset:
1010
https://www.openml.org/search?type=data&sort=runs&id=42165&status=active.
1111
12+
Note: OpenML is down as of 14/01/25, so we'll load the data from Kaggle instead.
13+
1214
The data is modelled by a Random Forest model
1315
:class:`~sklearn.ensemble.RandomForestRegressor` with a fixed parameter set.
1416
The prediction intervals are determined by means of the MAPIE regressor
@@ -31,7 +33,10 @@
3133
"""
3234
import matplotlib.pyplot as plt
3335
import numpy as np
34-
from sklearn.datasets import fetch_openml
36+
import requests
37+
import zipfile
38+
import io
39+
import pandas as pd
3540
from sklearn.ensemble import RandomForestRegressor
3641
from sklearn.model_selection import train_test_split
3742

@@ -43,12 +48,14 @@
4348

4449
# Parameters
4550
features = [
46-
"MSSubClass",
47-
"LotArea",
48-
"OverallQual",
49-
"OverallCond",
50-
"GarageArea",
51+
"MS SubClass",
52+
"Lot Area",
53+
"Overall Qual",
54+
"Overall Cond",
55+
"Garage Area",
5156
]
57+
target = "SalePrice"
58+
5259
alpha = 0.05
5360
rf_kwargs = {"n_estimators": 10, "random_state": random_state}
5461
model = RandomForestRegressor(**rf_kwargs)
@@ -63,7 +70,17 @@
6370
# in such cases.
6471
# Two sub datasets are extracted: the training and test ones.
6572

66-
X, y = fetch_openml(name="house_prices", return_X_y=True)
73+
dataset_url = (
74+
"https://www.kaggle.com" +
75+
"/api/v1/datasets/download/shashanknecrothapa/ames-housing-dataset"
76+
)
77+
r = requests.get(dataset_url, stream=True)
78+
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
79+
with z.open("AmesHousing.csv") as file:
80+
data = pd.read_csv(file)
81+
82+
X = data[features]
83+
y = data[target]
6784

6885
X_train, X_test, y_train, y_test = train_test_split(
6986
X[features], y, test_size=0.2, random_state=random_state

0 commit comments

Comments
 (0)