215 changes: 100 additions & 115 deletions machine_learning/forecasting/run.py
@@ -1,162 +1,147 @@
"""
this is code for forecasting
but I modified it and used it for safety checker of data
for ex: you have an online shop and for some reason some data are
missing (the amount of data that u expected are not supposed to be)
then we can use it
*ps : 1. ofc we can use normal statistic method but in this case
the data is quite absurd and only a little^^
2. ofc u can use this and modified it for forecasting purpose
for the next 3 months sales or something,
u can just adjust it for ur own purpose
"""
This code forecasts user activity and checks data safety in an online shop context.
It predicts total users based on historical data and checks if the current data is within a safe range.

Check failure on line 3 in machine_learning/forecasting/run.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E501)

machine_learning/forecasting/run.py:3:89: E501 Line too long (103 > 88)

from warnings import simplefilter
You can modify it for various forecasting purposes or for different datasets.

Usage:
- Load your data from a CSV file.
- Ensure the CSV has columns for total users, events, and dates.
"""

import logging
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVR
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

def load_data(file_path: str) -> pd.DataFrame:
"""Load data from a CSV file."""
try:
data = pd.read_csv(file_path)
logging.info("Data loaded successfully.")
return data
except FileNotFoundError:
logging.error("The file was not found.")
raise
except Exception as e:
logging.error(f"Error loading data: {e}")
raise
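
# A minimal usage sketch (using the same "ex_data.csv" as the __main__ block
# below; assumes a CSV with three numeric columns: total_user, total_event,
# date):
#
#     df = load_data("ex_data.csv")
#     print(df.head())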


def normalize_data(data: pd.DataFrame) -> np.ndarray:
"""Normalize the input data."""
return Normalizer().fit_transform(data.values)
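
# Note: sklearn's Normalizer rescales each *row* (sample) to unit L2 norm,
# not each column. A quick sketch of what that means for one row:
#
#     Normalizer().fit_transform(np.array([[3.0, 4.0, 0.0]]))
#     # -> [[0.6, 0.8, 0.0]]  ([3, 4, 0] divided by its L2 norm, 5)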


def train_test_split_data(normalize_df: np.ndarray) -> tuple:
    """Split the normalized data into training and test sets."""
    total_user = normalize_df[:, 0].tolist()
    total_match = normalize_df[:, 1].tolist()
    total_date = normalize_df[:, 2].tolist()

    # shuffle=False keeps the rows in chronological order, so the feature rows
    # stay aligned with the target slices taken below (the default shuffle
    # would silently break that pairing)
    x = normalize_df[:, [1, 2]].tolist()
    x_train, x_test = train_test_split(x, test_size=0.2, shuffle=False)

    train_user = total_user[: len(x_train)]
    test_user = total_user[len(x_train) :]
    train_match = total_match[: len(x_train)]
    test_match = total_match[len(x_train) :]

    return (
        x_train,
        x_test,
        train_user,
        test_user,
        train_match,
        test_match,
        total_date,
    )
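
# A sketch of the split on toy data (five rows of [user, event, date] floats):
#
#     toy = np.array([[i, i + 1.0, i + 2.0] for i in range(5)])
#     x_tr, x_te, *rest = train_test_split_data(toy)
#     # with test_size=0.2 and shuffle=False, the last row becomes the test set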


def linear_regression_prediction(
    train_dt: list, train_usr: list, train_mtch: list, test_dt: list, test_mtch: list
) -> float:
    """Predict total users using linear regression (normal equation).

    input : training data (date, total_user, total_event) as lists of float
    output : predicted total user as a float
    >>> n = linear_regression_prediction([2,3,4,5], [5,3,4,6], [3,1,2,4], [2,1], [2,2])
    >>> bool(abs(n - 4.0) < 1e-6)  # precision check for floating point errors
    True
    """
    x = np.array([[1, item, train_mtch[i]] for i, item in enumerate(train_dt)])
    y = np.array(train_usr)
    # Compute coefficients via the normal equation: beta = (X^T X)^-1 X^T y
    beta = np.linalg.inv(x.T @ x) @ x.T @ y
    return float(beta[0] + test_dt[0] * beta[1] + test_mtch[0] * beta[2])
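
# Worked example on the doctest data: X = [[1,2,3],[1,3,1],[1,4,2],[1,5,4]]
# and y = [5,3,4,6] give beta = [2, 0, 1] from solving (X^T X) beta = X^T y,
# so the prediction for (date=2, events=2) is 2 + 2*0 + 2*1 = 4.0.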


def sarimax_predictor(train_user: list, train_match: list, test_match: list) -> float:
    """Predict total users using SARIMAX.

    SARIMAX is a statistical method that learns the pattern of previous
    observations (plus exogenous regressors) to predict future data.
    input : training data (total_user, with exog data = total_event) as lists of float
    output : predicted total user as a float
    >>> isinstance(sarimax_predictor([4, 2, 6, 8], [3, 1, 2, 4], [2]), float)
    True
    """
    # Suppress the UserWarning raised by SARIMAX due to insufficient observations
    simplefilter("ignore", UserWarning)
    order = (1, 2, 1)
    seasonal_order = (1, 1, 1, 7)  # weekly seasonality assumed

    model = SARIMAX(
        train_user, exog=train_match, order=order, seasonal_order=seasonal_order
    )
    model_fit = model.fit(disp=False, maxiter=600, method="nm")

    # Forecast the period right after the training window (out-of-sample)
    result = model_fit.predict(
        start=len(train_user),
        end=len(train_user) + len(test_match) - 1,
        exog=test_match,
    )
    return float(result[0])
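
# Usage note: the out-of-sample exog must supply one value per forecast step
# per exogenous variable; a plain list works here because there is a single
# exogenous variable. A hedged sketch with two test periods (toy values):
#
#     sarimax_predictor([4, 2, 6, 8, 5], [3, 1, 2, 4, 3], [2, 3])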


def support_vector_regressor(x_train: list, x_test: list, train_user: list) -> float:
    """Predict total users using a support vector regressor (SVR).

    SVR applies the same principles as the SVM classifier, with a few minor
    differences that make it better suited to regression.
    input : x = list of [total_event, date] pairs, train_user = targets as floats
    output : predicted total user as a float
    >>> support_vector_regressor([[5,2],[1,5],[6,2]], [[3,2]], [2,1,4])
    1.634932078116079
    """
    regressor = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)
    regressor.fit(x_train, train_user)
    y_pred = regressor.predict(x_test)
    return float(y_pred[0])
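
# The SVR hyperparameters are fixed above; a hedged sketch of tuning them with
# scikit-learn's GridSearchCV instead (toy data, not the real dataset):
#
#     from sklearn.model_selection import GridSearchCV
#     grid = GridSearchCV(SVR(kernel="rbf"), {"C": [0.1, 1, 10]}, cv=2)
#     grid.fit([[5, 2], [1, 5], [6, 2], [3, 3]], [2, 1, 4, 3])
#     tuned = grid.best_estimator_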


def interquartile_range_checker(train_user: list) -> float:
    """
    Optional method: interquartile range
    input : list of total users as floats
    output : low limit of the input as a float
    This method can be used to check whether a data point is an outlier.
    >>> interquartile_range_checker([1,2,3,4,5,6,7,8,9,10])
    2.8
    """
    train_user.sort()
    q1 = np.percentile(train_user, 25)
    q3 = np.percentile(train_user, 75)
    iqr = q3 - q1
    low_lim = q1 - (iqr * 0.1)
    return float(low_lim)
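
# Worked example: for [1..10], np.percentile gives q1 = 3.25 and q3 = 7.75,
# so iqr = 4.5 and the low limit is 3.25 - 0.1 * 4.5 = 2.8 (the doctest value).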


def data_safety_checker(list_vote: list, actual_result: float) -> bool:
    """Review all the votes (predictions) and compare them to the actual
    result; the data counts as safe when the safe votes outnumber the
    unsafe ones.

    input : list of predictions and the actual value
    output : True if today's data is considered safe
    >>> data_safety_checker([2, 3, 4], 5.0)
    False
    """
    safe = 0
    not_safe = 0

    if not isinstance(actual_result, (float, int)):
        logging.error("Actual result should be float or int.")
        raise TypeError("Actual result should be float or int.")

    for prediction in list_vote:
        if prediction > actual_result:
            safe += 1
        elif abs(prediction - actual_result) <= 0.1:
            safe += 1
        else:
            not_safe += 1
    return safe > not_safe
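
# Worked example: with votes [5.2, 4.95, 3.0] against actual 5.0, the first
# vote is above the actual (safe), the second is within 0.1 (safe), and the
# third is neither (not safe), so safe=2 > not_safe=1 and the result is True:
#
#     data_safety_checker([5.2, 4.95, 3.0], 5.0)  # -> True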


def plot_results(res_vote: list, actual: float) -> None:
    """Plot the predicted vs actual results."""
    plt.figure(figsize=(10, 5))
    plt.plot(range(len(res_vote)), res_vote, label="Predictions", marker="o")
    plt.axhline(y=actual, color="r", linestyle="-", label="Actual Result")
    plt.title("Predicted vs Actual User Count")
    plt.xlabel("Model")
    plt.ylabel("User Count")
    plt.xticks(range(len(res_vote)), ["Linear Regression", "SARIMAX", "SVR"])
    plt.legend()
    plt.show()
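
# Note: plt.show() blocks until the window is closed; on a headless machine
# you could swap it for plt.savefig("forecast.png") (a hedged alternative).
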

if __name__ == "__main__":
    """
    data columns = total users in a day, how many online events were held
    that day, and which day of the week it is (Sunday-Saturday)
    """
    # Load and process data
    data_input_df = load_data("ex_data.csv")

    # Normalize data
    normalize_df = normalize_data(data_input_df)

    # Split data into the relevant lists
    (
        x_train,
        x_test,
        train_user,
        test_user,
        train_match,
        test_match,
        total_date,
    ) = train_test_split_data(normalize_df)

    # Voting system with forecasting
    train_date = total_date[: len(train_user)]
    test_date = total_date[len(train_user) : len(train_user) + len(test_user)]
    res_vote = [
        linear_regression_prediction(
            train_date, train_user, train_match, test_date, test_match
        ),
        sarimax_predictor(train_user, train_match, test_match),
        support_vector_regressor(x_train, x_test, train_user),
    ]

    # Check the safety of today's data
    is_safe = data_safety_checker(res_vote, test_user[0])
    not_str = "" if is_safe else "not "
    logging.info(f"Today's data is {not_str}safe.")

# Plot the results
plot_results(res_vote, test_user[0])
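
# To run the embedded doctests (a hedged suggestion, assuming the usual
# repository layout):
#
#     python -m doctest machine_learning/forecasting/run.py -v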