
Commit 614f6c1

docs: Add comprehensive documentation to linear_regression.py
This PR addresses issue #13919 by improving documentation for linear_regression.py:
- Added comprehensive module-level docstring with mathematical foundation
- Documented time and space complexity for all functions
- Added type hints using numpy.typing.NDArray
- Enhanced docstrings with Args, Returns, Raises sections
- Added additional doctests for edge cases
- Included references to Wikipedia articles
1 parent a051ab5 commit 614f6c1


machine_learning/linear_regression.py

Lines changed: 164 additions & 61 deletions
@@ -1,11 +1,28 @@
-"""
-Linear regression is the most basic type of regression commonly used for
-predictive analysis. The idea is pretty simple: we have a dataset and we have
-features associated with it. Features should be chosen very cautiously
-as they determine how much our model will be able to make future predictions.
-We try to set the weight of these features, over many iterations, so that they best
-fit our dataset. In this particular code, I had used a CSGO dataset (ADR vs
-Rating). We try to best fit a line through dataset and estimate the parameters.
+"""Linear Regression Implementation.
+
+Linear regression is a fundamental supervised machine learning algorithm used for
+predictive analysis. It models the relationship between a dependent variable (y)
+and one or more independent variables (x) by fitting a linear equation.
+
+Mathematical Foundation:
+    The model assumes: y = θ₀ + θ₁x₁ + θ₂x₂ + ... + θₙxₙ + ε
+    where θ are the parameters (weights) and ε is the error term.
+
+    The cost function (Mean Squared Error) is minimized using gradient descent:
+    J(θ) = (1/2m) * Σ(h(x⁽ⁱ⁾) - y⁽ⁱ⁾)²
+
+    Gradient descent update rule:
+    θⱼ := θⱼ - α * (∂J/∂θⱼ)
+
+Time Complexity:
+    - Training: O(n * m * iterations) where n = features, m = samples
+    - Prediction: O(n) per sample
+
+Space Complexity: O(n * m) for storing the dataset
+
+References:
+    - https://en.wikipedia.org/wiki/Linear_regression
+    - https://en.wikipedia.org/wiki/Gradient_descent
 """
 
 # /// script
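
The update rule in the new module docstring is the vectorized step θ := θ − (α/m)·((θXᵀ − yᵀ)X) that run_steep_gradient_descent applies further down in this diff. A minimal NumPy sketch of a single step, using illustrative values rather than the CSGO data:

import numpy as np

# Toy data on the line y = 1 + x; the first column of ones is the intercept term.
data_x = np.array([[1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
data_y = np.array([3.0, 4.0, 5.0])
theta = np.zeros(2)
alpha, m = 0.01, len(data_x)

# One gradient-descent step: theta := theta - (alpha / m) * ((theta @ X.T - y) @ X)
error = theta @ data_x.T - data_y   # predictions minus targets, shape (m,)
gradient = error @ data_x           # gradient summed over all samples, shape (n,)
theta = theta - (alpha / m) * gradient
print(theta)  # both weights move upward, toward the underlying [1, 1]
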
@@ -18,12 +35,26 @@
 
 import httpx
 import numpy as np
+from numpy.typing import NDArray
+
+
+def collect_dataset() -> NDArray:
+    """Collect dataset of CSGO player statistics.
 
+    Fetches a CSV dataset containing ADR (Average Damage per Round) vs Rating
+    of CSGO players from an external source.
 
-def collect_dataset():
-    """Collect dataset of CSGO
-    The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
+    Returns:
+        NDArray: A numpy matrix containing the dataset with ADR and Rating values.
+
+    Raises:
+        httpx.TimeoutException: If the request times out after 10 seconds.
+        httpx.HTTPError: If there's an error fetching the dataset.
+
+    Example:
+        >>> dataset = collect_dataset()  # doctest: +SKIP
+        >>> dataset.shape[1] == 2  # doctest: +SKIP
+        True
     """
     response = httpx.get(
         "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
@@ -35,51 +66,89 @@ def collect_dataset():
     for item in lines:
         item = item.split(",")
         data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
+    data.pop(0)  # Remove the header labels
     dataset = np.matrix(data)
     return dataset
 
 
-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x : contains the dataset
-    :param data_y : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha : Learning rate of the model
-    :param theta : Feature vector (weight's for our model)
-    ;param return : Updated Feature's, using
-                    curr_features - alpha_ * gradient(w.r.t. feature)
-    >>> import numpy as np
-    >>> data_x = np.array([[1, 2], [3, 4]])
-    >>> data_y = np.array([5, 6])
-    >>> len_data = len(data_x)
-    >>> alpha = 0.01
-    >>> theta = np.array([0.1, 0.2])
-    >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-    array([0.196, 0.343])
+def run_steep_gradient_descent(
+    data_x: NDArray,
+    data_y: NDArray,
+    len_data: int,
+    alpha: float,
+    theta: NDArray,
+) -> NDArray:
+    """Perform one iteration of gradient descent to update feature weights.
+
+    Gradient descent is an optimization algorithm that iteratively adjusts
+    parameters to minimize the cost function.
+
+    Args:
+        data_x: Input feature matrix of shape (m, n) where m = samples, n = features.
+        data_y: Target values array of shape (m,).
+        len_data: Number of training samples.
+        alpha: Learning rate controlling the step size (typically 0.001 to 0.1).
+        theta: Current weight vector of shape (1, n).
+
+    Returns:
+        NDArray: Updated weight vector after one gradient descent step.
+
+    Time Complexity: O(m * n) for matrix operations.
+    Space Complexity: O(m * n) for intermediate calculations.
+
+    Example:
+        >>> import numpy as np
+        >>> data_x = np.array([[1, 2], [3, 4]])
+        >>> data_y = np.array([5, 6])
+        >>> len_data = len(data_x)
+        >>> alpha = 0.01
+        >>> theta = np.array([0.1, 0.2])
+        >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
+        array([0.196, 0.343])
     """
     n = len_data
-
     prod = np.dot(theta, data_x.transpose())
     prod -= data_y.transpose()
     sum_grad = np.dot(prod, data_x)
     theta = theta - (alpha / n) * sum_grad
     return theta
 
 
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :param len_data : len of the dataset
-    :param theta : contains the feature vector
-    :return : sum of square error computed from given feature's
+def sum_of_square_error(
+    data_x: NDArray,
+    data_y: NDArray,
+    len_data: int,
+    theta: NDArray,
+) -> float:
+    """Calculate the Sum of Squared Errors (SSE) for the current model.
+
+    SSE measures how well the model fits the data by computing the sum of
+    squared differences between predicted and actual values.
+
+    Args:
+        data_x: Input feature matrix of shape (m, n).
+        data_y: Actual target values of shape (m,).
+        len_data: Number of data samples.
+        theta: Current weight vector of shape (1, n).
+
+    Returns:
+        float: The mean squared error value (SSE divided by 2m).
+
+    Time Complexity: O(m * n) for prediction and error calculation.
+    Space Complexity: O(m) for storing predictions.
 
     Example:
-    >>> vc_x = np.array([[1.1], [2.1], [3.1]])
-    >>> vc_y = np.array([1.2, 2.2, 3.2])
-    >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])),3)
-    np.float64(0.005)
+        >>> import numpy as np
+        >>> vc_x = np.array([[1.1], [2.1], [3.1]])
+        >>> vc_y = np.array([1.2, 2.2, 3.2])
+        >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])), 3)
+        np.float64(0.005)
+
+        >>> # Test with perfect fit
+        >>> x = np.array([[1], [2], [3]])
+        >>> y = np.array([1, 2, 3])
+        >>> sum_of_square_error(x, y, 3, np.array([1]))
+        np.float64(0.0)
     """
     prod = np.dot(theta, data_x.transpose())
     prod -= data_y.transpose()
@@ -88,18 +157,30 @@ def sum_of_square_error(data_x, data_y, len_data, theta):
     return error
 
 
-def run_linear_regression(data_x, data_y):
-    """Implement Linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return : feature for line of best fit (Feature vector)
+def run_linear_regression(data_x: NDArray, data_y: NDArray) -> NDArray:
+    """Train a linear regression model using gradient descent.
+
+    Iteratively optimizes the weight parameters to minimize the cost function
+    (mean squared error) over the training data.
+
+    Args:
+        data_x: Input feature matrix of shape (m, n).
+        data_y: Target values of shape (m,).
+
+    Returns:
+        NDArray: Optimized weight vector (theta) of shape (1, n).
+
+    Time Complexity: O(iterations * m * n) where default iterations = 100000.
+    Space Complexity: O(m * n) for storing the dataset.
+
+    Note:
+        The learning rate (alpha) is set to 0.0001550 and may need tuning
+        for different datasets.
     """
     iterations = 100000
     alpha = 0.0001550
-
     no_features = data_x.shape[1]
     len_data = data_x.shape[0] - 1
-
     theta = np.zeros((1, no_features))
 
     for i in range(iterations):
@@ -110,25 +191,47 @@ def run_linear_regression(data_x, data_y):
     return theta
 
 
-def mean_absolute_error(predicted_y, original_y):
-    """Return sum of square error for error calculation
-    :param predicted_y : contains the output of prediction (result vector)
-    :param original_y : contains values of expected outcome
-    :return : mean absolute error computed from given feature's
+def mean_absolute_error(predicted_y: list, original_y: list) -> float:
+    """Calculate Mean Absolute Error (MAE) between predicted and actual values.
+
+    MAE is a common metric for regression models that measures the average
+    magnitude of errors without considering direction.
+
+    Args:
+        predicted_y: List of predicted values.
+        original_y: List of actual/expected values.
+
+    Returns:
+        float: The mean absolute error.
+
+    Time Complexity: O(n) where n is the number of samples.
+    Space Complexity: O(1) for accumulator.
 
-    >>> predicted_y = [3, -0.5, 2, 7]
-    >>> original_y = [2.5, 0.0, 2, 8]
-    >>> mean_absolute_error(predicted_y, original_y)
-    0.5
+    Example:
+        >>> predicted_y = [3, -0.5, 2, 7]
+        >>> original_y = [2.5, 0.0, 2, 8]
+        >>> mean_absolute_error(predicted_y, original_y)
+        0.5
+
+        >>> # Test with identical values (perfect prediction)
+        >>> mean_absolute_error([1, 2, 3], [1, 2, 3])
+        0.0
+
+        >>> # Test with negative values
+        >>> mean_absolute_error([-1, -2], [1, 2])
+        3.0
     """
     total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y))
     return total / len(original_y)
 
 
-def main():
-    """Driver function"""
-    data = collect_dataset()
+def main() -> None:
+    """Driver function to demonstrate linear regression.
 
+    Loads the CSGO dataset, trains a linear regression model,
+    and prints the resulting feature vector.
+    """
+    data = collect_dataset()
     len_data = data.shape[0]
     data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
     data_y = data[:, -1].astype(float)
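
With the documented signatures in place, the functions can be exercised end to end without the remote CSGO dataset. A minimal sketch on synthetic data, assuming the module is importable as machine_learning.linear_regression (import path is an assumption) and using a learning rate chosen for this toy data rather than the module's default 0.0001550:

import numpy as np

# Assumed import path; adjust to however the file is exposed in your checkout.
from machine_learning.linear_regression import (
    mean_absolute_error,
    run_steep_gradient_descent,
    sum_of_square_error,
)

# Synthetic stand-in for the CSGO data: y ≈ 0.5 + 2x with mild noise.
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=50)
y = 0.5 + 2.0 * x + rng.normal(0, 0.1, size=50)

# Mirror main(): prepend a column of ones for the intercept before training.
data_x = np.c_[np.ones(len(x)), x]
theta = np.zeros((1, 2))
for _ in range(10_000):
    theta = run_steep_gradient_descent(data_x, y, len(data_x), 0.01, theta)

print(theta)  # approximately [[0.5, 2.0]]
print(sum_of_square_error(data_x, y, len(data_x), theta))
predictions = (theta @ data_x.T).flatten().tolist()
print(mean_absolute_error(predictions, y.tolist()))

Looping run_steep_gradient_descent directly keeps the sketch independent of run_linear_regression's fixed iteration count and learning rate.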
