-
-
Notifications
You must be signed in to change notification settings - Fork 48.7k
Added best_random_state_in_random_forest.py: an algorithm to find the best random state of a random forest classifier #12277
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import pandas as pd | ||
import warnings | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.metrics import accuracy_score | ||
from sklearn.ensemble import RandomForestClassifier | ||
from sklearn.preprocessing import StandardScaler | ||
|
||
# Install a blanket "ignore" filter at import time: every warning raised after
# this point is suppressed for the whole process.
# NOTE(review): presumably meant to keep the repeated model fits quiet, but it
# also hides deprecation notices from the imported libraries -- confirm intent.
warnings.filterwarnings(action="ignore")
|
||
|
||
def find_best_random_state(
    data: pd.DataFrame, target_column: str, iterations: int = 200
) -> int:
    """
    Find the Random Forest seed that yields the highest hold-out accuracy.

    The data is split once into an 80/20 train/test partition (the split itself
    uses a fixed ``random_state=0``), the features are standardised with a
    scaler fitted on the training part only, and one ``RandomForestClassifier``
    is trained per candidate seed in ``range(iterations)``. The seed whose
    predictions score the highest accuracy on the test part is returned; on a
    tie the smallest seed wins. A summary line is printed before returning.

    Args:
        data (pd.DataFrame): The dataset containing features and target variable.
        target_column (str): The name of the target column in the dataset.
        iterations (int): Number of random states to test, i.e. seeds
            ``0 .. iterations - 1``. Default is 200.

    Returns:
        int: The random state that provides the best accuracy.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
        KeyError: If ``target_column`` is not a column of ``data``.

    Examples:
        >>> frame = pd.DataFrame({"feature": range(20), "target": [0, 1] * 10})
        >>> find_best_random_state(frame, "target", iterations=0)
        Traceback (most recent call last):
            ...
        ValueError: iterations must be a positive integer, got 0
        >>> find_best_random_state(frame, "label")
        Traceback (most recent call last):
            ...
        KeyError: "target column 'label' not found in data"
        >>> find_best_random_state(frame, "target", iterations=1)  # doctest: +ELLIPSIS
        The best random state is: 0 with an accuracy score of: ... %
        0
    """
    # Fail fast: with no seed to evaluate the function would otherwise report
    # seed 0 at 0 % accuracy as if it had been measured.
    if iterations < 1:
        raise ValueError(f"iterations must be a positive integer, got {iterations}")
    if target_column not in data.columns:
        raise KeyError(f"target column {target_column!r} not found in data")

    # Split dataset into predictors and target
    predictors = data.drop(columns=target_column)
    target = data[target_column]

    # Split dataset into train and test sets; the fixed seed keeps the partition
    # identical for every candidate so only the forest's seed varies.
    x_train, x_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.20, random_state=0
    )

    # Scale features using statistics from the training part only
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    max_accuracy_rf = 0
    best_random_state = 0

    # Loop through specified random states
    for random_state in range(iterations):
        rf = RandomForestClassifier(random_state=random_state)
        rf.fit(x_train_scaled, y_train)
        y_pred_rf = rf.predict(x_test_scaled)

        # Accuracy as a percentage rounded to two decimals; the strict ">"
        # keeps the earliest seed when several seeds tie.
        current_accuracy = round(accuracy_score(y_test, y_pred_rf) * 100, 2)
        if current_accuracy > max_accuracy_rf:
            max_accuracy_rf = current_accuracy
            best_random_state = random_state

    print(f"The best random state is: {best_random_state} with an accuracy score of: {max_accuracy_rf} %")
    return best_random_state
|
||
|
||
if __name__ == "__main__":
    # Script entry point: read heart.csv (path relative to the working
    # directory) and search for the best seed using its "target" column.
    # find_best_random_state prints its own summary line, so the returned
    # value is only kept here, not printed again.
    best_state = find_best_random_state(
        pd.read_csv("heart.csv"), target_column="target"
    )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As there is no test file in this pull request nor any test function or class in the file `machine_learning/best_random_state_in_random_forest.py`, please provide a doctest for the function `find_best_random_state`.