diff --git a/machine_learning/forecasting/run.py b/machine_learning/forecasting/run.py
index 9d81b03cd09e..7ac60dd464e0 100644
--- a/machine_learning/forecasting/run.py
+++ b/machine_learning/forecasting/run.py
@@ -1,162 +1,147 @@
 """
-this is code for forecasting
-but I modified it and used it for safety checker of data
-for ex: you have an online shop and for some reason some data are
-missing (the amount of data that u expected are not supposed to be)
- then we can use it
-*ps : 1. ofc we can use normal statistic method but in this case
- the data is quite absurd and only a little^^
- 2. ofc u can use this and modified it for forecasting purpose
- for the next 3 months sales or something,
- u can just adjust it for ur own purpose
-"""
+This code forecasts user activity and checks data safety in an online-shop
+context: it predicts total users from historical data and checks whether
+today's figure falls within a safe range of those predictions.
 
-from warnings import simplefilter
+You can modify it for other forecasting purposes or for different datasets.
+
+Usage:
+- Load your data from a CSV file.
+- Ensure the CSV has columns for total users, total events, and dates.
+"""
+import logging
+from warnings import simplefilter
 
 import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
 from sklearn.preprocessing import Normalizer
+from sklearn.model_selection import train_test_split
 from sklearn.svm import SVR
 from statsmodels.tsa.statespace.sarimax import SARIMAX
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+def load_data(file_path: str) -> pd.DataFrame:
+    """Load data from a CSV file."""
+    try:
+        data = pd.read_csv(file_path)
+        logging.info("Data loaded successfully.")
+        return data
+    except FileNotFoundError:
+        logging.error("The file was not found.")
+        raise
+    except Exception as e:
+        logging.error(f"Error loading data: {e}")
+        raise
 
 
-def linear_regression_prediction(
-    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
-) -> float:
-    """
-    First method: linear regression
-    input : training data (date, total_user, total_event) in list of float
-    output : list of total user prediction in float
-    >>> n = linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
-    >>> bool(abs(n - 5.0) < 1e-6)  # Checking precision because of floating point errors
-    True
-    """
+def normalize_data(data: pd.DataFrame) -> np.ndarray:
+    """Normalize the input data."""
+    return Normalizer().fit_transform(data.values)
+
+
+def train_test_split_data(normalize_df: np.ndarray) -> tuple:
+    """Split the normalized data into training and test sets."""
+    total_user = normalize_df[:, 0].tolist()
+    total_match = normalize_df[:, 1].tolist()
+    total_date = normalize_df[:, 2].tolist()
+
+    x = normalize_df[:, [1, 2]].tolist()
+    # Split chronologically (shuffle=False): the positional slices below only
+    # line up with x_train and x_test when the rows keep their original order.
+    x_train, x_test = train_test_split(x, test_size=0.2, shuffle=False)
+
+    train_user = total_user[: len(x_train)]
+    test_user = total_user[len(x_train) :]
+    train_match = total_match[: len(x_train)]
+    test_match = total_match[len(x_train) :]
+
+    return (
+        x_train,
+        x_test,
+        train_user,
+        test_user,
+        train_match,
+        test_match,
+        total_date,
+    )
+
+
+def linear_regression_prediction(
+    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
+) -> float:
+    """Predict total users using linear regression.
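+
+    A toy doctest: on this data the least-squares fit is exact, so the normal
+    equation gives beta = [2, 0, 1] and the prediction 2 + 2*0 + 2*1 = 4.0
+    (the tolerance absorbs floating-point noise from the matrix inverse):
+
+    >>> n = linear_regression_prediction(
+    ...     [2, 3, 4, 5], [5, 3, 4, 6], [3, 1, 2, 4], [2, 1], [2, 2]
+    ... )
+    >>> bool(abs(n - 4.0) < 1e-6)
+    True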
+    """
     x = np.array([[1, item, train_mtch[i]] for i, item in enumerate(train_dt)])
     y = np.array(train_usr)
-    beta = np.dot(np.dot(np.linalg.inv(np.dot(x.transpose(), x)), x.transpose()), y)
-    return abs(beta[0] + test_dt[0] * beta[1] + test_mtch[0] + beta[2])
+
+    # Compute the coefficients with the normal equation: beta = (X^T X)^-1 X^T y
+    beta = np.linalg.inv(x.T @ x) @ x.T @ y
+    return float(beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2])
 
 
 def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
-    """
-    second method: Sarimax
-    sarimax is a statistic method which using previous input
-    and learn its pattern to predict future data
-    input : training data (total_user, with exog data = total_event) in list of float
-    output : list of total user prediction in float
-    >>> sarimax_predictor([4,2,6,8], [3,1,2,4], [2])
-    6.6666671111109626
-    """
-    # Suppress the User Warning raised by SARIMAX due to insufficient observations
+    """Predict total users using SARIMAX."""
+    # Suppress the UserWarning raised by SARIMAX due to insufficient observations
     simplefilter("ignore", UserWarning)
     order = (1, 2, 1)
-    seasonal_order = (1, 1, 1, 7)
+    seasonal_order = (1, 1, 1, 7)  # Weekly seasonality assumed
     model = SARIMAX(
         train_user, exog=train_match, order=order, seasonal_order=seasonal_order
     )
     model_fit = model.fit(disp=False, maxiter=600, method="nm")
-    result = model_fit.predict(1, len(test_match), exog=[test_match])
+
+    # Forecast the out-of-sample steps; reshape the exog to the 2D
+    # (n_steps, k_exog) layout that statsmodels expects.
+    result = model_fit.predict(
+        start=len(train_user),
+        end=len(train_user) + len(test_match) - 1,
+        exog=np.asarray(test_match).reshape(-1, 1),
+    )
     return float(result[0])
 
 
 def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
-    """
-    Third method: Support vector regressor
-    svr is quite the same with svm(support vector machine)
-    it uses the same principles as the SVM for classification,
-    with only a few minor differences and the only different is that
-    it suits better for regression purpose
-    input : training data (date, total_user, total_event) in list of float
-    where x = list of set (date and total event)
-    output : list of total user prediction in float
-    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
-    1.634932078116079
-    """
+    """Predict total users using a support vector regressor."""
     regressor = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
     regressor.fit(x_train, train_user)
     y_pred = regressor.predict(x_test)
     return float(y_pred[0])
 
 
-def interquartile_range_checker(train_user: list) -> float:
-    """
-    Optional method: interquatile range
-    input : list of total user in float
-    output : low limit of input in float
-    this method can be used to check whether some data is outlier or not
-    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
-    2.8
-    """
-    train_user.sort()
-    q1 = np.percentile(train_user, 25)
-    q3 = np.percentile(train_user, 75)
-    iqr = q3 - q1
-    low_lim = q1 - (iqr * 0.1)
-    return float(low_lim)
-
-
 def data_safety_checker(list_vote: list, actual_result: float) -> bool:
-    """
-    Used to review all the votes (list result prediction)
-    and compare it to the actual result.
-    input : list of predictions
-    output : print whether it's safe or not
-    >>> data_safety_checker([2, 3, 4], 5.0)
-    False
-    """
+    """Check whether the predictions are within a safe range of the actual result.
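+
+    A doctest carried over from the original version; it still holds for the
+    vote counting below (no prediction exceeds 5.0 or lands within 0.1 of it,
+    so every vote counts as not safe):
+
+    >>> data_safety_checker([2, 3, 4], 5.0)
+    False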
Value passed is a list") + if not isinstance(actual_result, (float, int)): + logging.error("Actual result should be float or int.") + raise TypeError("Actual result should be float or int.") - for i in list_vote: - if i > actual_result: - safe = not_safe + 1 - elif abs(abs(i) - abs(actual_result)) <= 0.1: + for prediction in list_vote: + if prediction > actual_result: + safe += 1 + elif abs(prediction - actual_result) <= 0.1: safe += 1 else: not_safe += 1 return safe > not_safe -if __name__ == "__main__": - """ - data column = total user in a day, how much online event held in one day, - what day is that(sunday-saturday) - """ - data_input_df = pd.read_csv("ex_data.csv") - - # start normalization - normalize_df = Normalizer().fit_transform(data_input_df.values) - # split data - total_date = normalize_df[:, 2].tolist() - total_user = normalize_df[:, 0].tolist() - total_match = normalize_df[:, 1].tolist() +def plot_results(res_vote: list, actual: float): + """Plot the predicted vs actual results.""" + plt.figure(figsize=(10, 5)) + plt.plot(range(len(res_vote)), res_vote, label='Predictions', marker='o') + plt.axhline(y=actual, color='r', linestyle='-', label='Actual Result') + plt.title('Predicted vs Actual User Count') + plt.xlabel('Model') + plt.ylabel('User Count') + plt.xticks(range(len(res_vote)), ['Linear Regression', 'SARIMAX', 'SVR']) + plt.legend() + plt.show() - # for svr (input variable = total date and total match) - x = normalize_df[:, [1, 2]].tolist() - x_train = x[: len(x) - 1] - x_test = x[len(x) - 1 :] - # for linear regression & sarimax - train_date = total_date[: len(total_date) - 1] - train_user = total_user[: len(total_user) - 1] - train_match = total_match[: len(total_match) - 1] +if __name__ == "__main__": + # Load and process data + data_input_df = load_data("ex_data.csv") + + # Normalize data + normalize_df = normalize_data(data_input_df) - test_date = total_date[len(total_date) - 1 :] - test_user = total_user[len(total_user) - 1 :] - test_match = total_match[len(total_match) - 1 :] + # Split data into relevant lists + x_train, x_test, train_user, test_user, train_match, test_match, total_date = train_test_split_data(normalize_df) - # voting system with forecasting + # Voting system with forecasting res_vote = [ - linear_regression_prediction( - train_date, train_user, train_match, test_date, test_match - ), + linear_regression_prediction(total_date[:len(train_user)], train_user, train_match, total_date[len(train_user):len(train_user)+len(test_user)], test_match), sarimax_predictor(train_user, train_match, test_match), - support_vector_regressor(x_train, x_test, train_user), + support_vector_regressor(x_train, x_test, train_user) ] - # check the safety of today's data - not_str = "" if data_safety_checker(res_vote, test_user[0]) else "not " - print(f"Today's data is {not_str}safe.") + # Check the safety of today's data + is_safe = data_safety_checker(res_vote, test_user[0]) + not_str = "" if is_safe else "not " + logging.info(f"Today's data is {not_str}safe.") + + # Plot the results + plot_results(res_vote, test_user[0])