|
9 | 9 | We use here the OpenML house_prices dataset: |
10 | 10 | https://www.openml.org/search?type=data&sort=runs&id=42165&status=active. |
11 | 11 |
|
| 12 | +Note: OpenML is down as of 14 January 2025, so we load the data from Kaggle instead. |
| 13 | +
|
12 | 14 | The data is modelled by a Random Forest model |
13 | 15 | :class:`~sklearn.ensemble.RandomForestRegressor` with a fixed parameter set. |
14 | 16 | The prediction intervals are determined by means of the MAPIE regressor |
|
31 | 33 | """ |
32 | 34 | import matplotlib.pyplot as plt |
33 | 35 | import numpy as np |
34 | | -from sklearn.datasets import fetch_openml |
| 36 | +import requests |
| 37 | +import zipfile |
| 38 | +import io |
| 39 | +import pandas as pd |
35 | 40 | from sklearn.ensemble import RandomForestRegressor |
36 | 41 | from sklearn.model_selection import train_test_split |
37 | 42 |
|
|
43 | 48 |
|
44 | 49 | # Parameters |
45 | 50 | features = [ |
46 | | - "MSSubClass", |
47 | | - "LotArea", |
48 | | - "OverallQual", |
49 | | - "OverallCond", |
50 | | - "GarageArea", |
| 51 | + "MS SubClass", |
| 52 | + "Lot Area", |
| 53 | + "Overall Qual", |
| 54 | + "Overall Cond", |
| 55 | + "Garage Area", |
51 | 56 | ] |
| 57 | +target = "SalePrice" |
| 58 | + |
52 | 59 | alpha = 0.05 |
53 | 60 | rf_kwargs = {"n_estimators": 10, "random_state": random_state} |
54 | 61 | model = RandomForestRegressor(**rf_kwargs) |
|
63 | 70 | # in such cases. |
64 | 71 | # Two sub-datasets are extracted: a training set and a test set. |
65 | 72 |
|
66 | | -X, y = fetch_openml(name="house_prices", return_X_y=True) |
| 73 | +dataset_url = ( |
| 74 | + "https://www.kaggle.com" + |
| 75 | + "/api/v1/datasets/download/shashanknecrothapa/ames-housing-dataset" |
| 76 | +) |
| 77 | +r = requests.get(dataset_url, stream=True) |
| 78 | +with zipfile.ZipFile(io.BytesIO(r.content)) as z: |
| 79 | + with z.open("AmesHousing.csv") as file: |
| 80 | + data = pd.read_csv(file) |
| 81 | + |
| 82 | +X = data[features] |
| 83 | +y = data[target] |
67 | 84 |
|
68 | 85 | X_train, X_test, y_train, y_test = train_test_split( |
69 | 86 | X[features], y, test_size=0.2, random_state=random_state |
|
0 commit comments