
Commit a9704c4

Merge branch 'master' into pruning_morph_op_hacktoberfest_joydipb01
2 parents 1f9c818 + c79034c commit a9704c4

File tree: 14 files changed, +223 −42 lines

.github/workflows/build.yml

Lines changed: 2 additions & 7 deletions
@@ -9,13 +9,7 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-      - run:
-          sudo apt-get update && sudo apt-get install -y libtiff5-dev libjpeg8-dev libopenjp2-7-dev
-          zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python3-tk
-          libharfbuzz-dev libfribidi-dev libxcb1-dev
-          libxml2-dev libxslt-dev
-          libhdf5-dev
-          libopenblas-dev
+      - run: sudo apt-get update && sudo apt-get install -y libhdf5-dev
       - uses: actions/checkout@v5
       - uses: astral-sh/setup-uv@v7
         with:
@@ -32,6 +26,7 @@ jobs:
           --ignore=computer_vision/cnn_classification.py
           --ignore=docs/conf.py
           --ignore=dynamic_programming/k_means_clustering_tensorflow.py
+          --ignore=machine_learning/local_weighted_learning/local_weighted_learning.py
           --ignore=machine_learning/lstm/lstm_prediction.py
           --ignore=neural_network/input_data.py
           --ignore=project_euler/

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -99,7 +99,7 @@ We want your work to be readable by others; therefore, we encourage you to note
 ruff check
 ```
 
-- Original code submission require docstrings or comments to describe your work.
+- Original code submissions require docstrings or comments to describe your work.
 
 - More on docstrings and comments:
DIRECTORY.md

Lines changed: 1 addition & 0 deletions
@@ -626,6 +626,7 @@
 * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
 * [Similarity Search](machine_learning/similarity_search.py)
 * [Support Vector Machines](machine_learning/support_vector_machines.py)
+* [T Stochastic Neighbour Embedding](machine_learning/t_stochastic_neighbour_embedding.py)
 * [Word Frequency Functions](machine_learning/word_frequency_functions.py)
 * [Xgboost Classifier](machine_learning/xgboost_classifier.py)
 * [Xgboost Regressor](machine_learning/xgboost_regressor.py)

machine_learning/decision_tree.py

Lines changed: 3 additions & 4 deletions
@@ -146,14 +146,13 @@ def predict(self, x):
         """
         if self.prediction is not None:
             return self.prediction
-        elif self.left or self.right is not None:
+        elif self.left is not None and self.right is not None:
             if x >= self.decision_boundary:
                 return self.right.predict(x)
             else:
                 return self.left.predict(x)
         else:
-            print("Error: Decision tree not yet trained")
-            return None
+            raise ValueError("Decision tree not yet trained")
 
 
 class TestDecisionTree:
@@ -201,4 +200,4 @@ def main():
     main()
     import doctest
 
-    doctest.testmod(name="mean_squarred_error", verbose=True)
+    doctest.testmod(name="mean_squared_error", verbose=True)
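The condition replaced here is a classic precedence pitfall: `self.left or self.right is not None` parses as `self.left or (self.right is not None)`, so a node with a left child but no right child would enter the branch and then call `.predict` on `None`. A standalone sketch of the two tests, with hypothetical stand-in values rather than the repo's tree nodes:

    # `a or b is not None` parses as `a or (b is not None)`
    left, right = object(), None  # half-built node: left child set, right missing

    buggy = left or right is not None               # truthy because `left` is truthy
    fixed = left is not None and right is not None  # requires both children

    print(bool(buggy), fixed)  # True False -- the buggy test would recurse into None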

machine_learning/t_stochastic_neighbour_embedding.py

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
+"""
+t-distributed stochastic neighbor embedding (t-SNE)
+
+For more details, see:
+https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
+"""
+
+import doctest
+
+import numpy as np
+from numpy import ndarray
+from sklearn.datasets import load_iris
+
+
+def collect_dataset() -> tuple[ndarray, ndarray]:
+    """
+    Load the Iris dataset and return features and labels.
+
+    Returns:
+        tuple[ndarray, ndarray]: Feature matrix and target labels.
+
+    >>> features, targets = collect_dataset()
+    >>> features.shape
+    (150, 4)
+    >>> targets.shape
+    (150,)
+    """
+    iris_dataset = load_iris()
+    return np.array(iris_dataset.data), np.array(iris_dataset.target)
+
+
+def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
+    """
+    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.
+
+    Args:
+        data_matrix: Input data of shape (n_samples, n_features).
+        sigma: Gaussian kernel bandwidth.
+
+    Returns:
+        ndarray: Symmetrized probability matrix.
+
+    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
+    >>> probabilities = compute_pairwise_affinities(x)
+    >>> float(round(probabilities[0, 1], 3))
+    0.25
+    """
+    n_samples = data_matrix.shape[0]
+    squared_sum = np.sum(np.square(data_matrix), axis=1)
+    squared_distance = np.add(
+        np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum
+    )
+
+    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
+    np.fill_diagonal(affinity_matrix, 0)
+
+    affinity_matrix /= np.sum(affinity_matrix)
+    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
+
+
+def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
+    """
+    Compute low-dimensional affinities (Q matrix) using a Student-t distribution.
+
+    Args:
+        embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).
+
+    Returns:
+        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
+
+    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
+    >>> q_matrix, numerators = compute_low_dim_affinities(y)
+    >>> q_matrix.shape
+    (2, 2)
+    """
+    squared_sum = np.sum(np.square(embedding_matrix), axis=1)
+    numerator_matrix = 1 / (
+        1
+        + np.add(
+            np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
+            squared_sum,
+        )
+    )
+    np.fill_diagonal(numerator_matrix, 0)
+
+    q_matrix = numerator_matrix / np.sum(numerator_matrix)
+    return q_matrix, numerator_matrix
+
+
+def apply_tsne(
+    data_matrix: ndarray,
+    n_components: int = 2,
+    learning_rate: float = 200.0,
+    n_iter: int = 500,
+) -> ndarray:
+    """
+    Apply t-SNE for dimensionality reduction.
+
+    Args:
+        data_matrix: Original dataset (features).
+        n_components: Target dimension (2D or 3D).
+        learning_rate: Step size for gradient descent.
+        n_iter: Number of iterations.
+
+    Returns:
+        ndarray: Low-dimensional embedding of the data.
+
+    >>> features, _ = collect_dataset()
+    >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
+    >>> embedding.shape
+    (150, 2)
+    """
+    if n_components < 1 or n_iter < 1:
+        raise ValueError("n_components and n_iter must be >= 1")
+
+    n_samples = data_matrix.shape[0]
+    rng = np.random.default_rng()
+    embedding = rng.standard_normal((n_samples, n_components)) * 1e-4
+
+    high_dim_affinities = compute_pairwise_affinities(data_matrix)
+    high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)
+
+    embedding_increment = np.zeros_like(embedding)
+    momentum = 0.5
+
+    for iteration in range(n_iter):
+        low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
+        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)
+
+        affinity_diff = high_dim_affinities - low_dim_affinities
+
+        # descent gradient: 4 * sum_j (p_ij - q_ij) * num_ij * (y_i - y_j)
+        gradient = 4 * (
+            np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis]
+            * embedding
+            - np.dot(affinity_diff * numerator_matrix, embedding)
+        )
+
+        embedding_increment = momentum * embedding_increment - learning_rate * gradient
+        embedding += embedding_increment
+
+        if iteration == int(n_iter / 4):
+            momentum = 0.8
+
+    return embedding
+
+
+def main() -> None:
+    """
+    Run t-SNE on the Iris dataset and display the first 5 embeddings.
+
+    >>> main() # doctest: +ELLIPSIS
+    t-SNE embedding (first 5 points):
+    [[...
+    """
+    features, _labels = collect_dataset()
+    embedding = apply_tsne(features, n_components=2, n_iter=300)
+
+    if not isinstance(embedding, np.ndarray):
+        raise TypeError("t-SNE embedding must be an ndarray")
+
+    print("t-SNE embedding (first 5 points):")
+    print(embedding[:5])
+
+    # Optional visualization (Ruff/mypy compliant)
+    # import matplotlib.pyplot as plt
+    # plt.scatter(embedding[:, 0], embedding[:, 1], c=_labels, cmap="viridis")
+    # plt.title("t-SNE Visualization of the Iris Dataset")
+    # plt.xlabel("Dimension 1")
+    # plt.ylabel("Dimension 2")
+    # plt.show()
+
+
+if __name__ == "__main__":
+    doctest.testmod()
+    main()
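For readers cross-checking the code against the literature: `compute_pairwise_affinities` is a simplified form of the paper's P matrix (a fixed bandwidth sigma rather than a per-point perplexity search), and the in-loop `gradient` expression vectorizes the standard KL-divergence gradient from van der Maaten & Hinton (2008), with `numerator_matrix` holding the (1 + ||y_i - y_j||^2)^{-1} terms. In that notation, with x_i the input rows and y_i the rows of `embedding`:

    p_{ij} \propto \exp\bigl(-\lVert x_i - x_j \rVert^2 / 2\sigma^2\bigr),
    \qquad
    q_{ij} \propto \bigl(1 + \lVert y_i - y_j \rVert^2\bigr)^{-1}

    \frac{\partial C}{\partial y_i}
      = 4 \sum_j (p_{ij} - q_{ij}) \bigl(1 + \lVert y_i - y_j \rVert^2\bigr)^{-1} (y_i - y_j),
    \qquad C = \operatorname{KL}(P \parallel Q)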

maths/factorial.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def factorial_recursive(n: int) -> int:
         raise ValueError("factorial() only accepts integral values")
     if n < 0:
         raise ValueError("factorial() not defined for negative values")
-    return 1 if n in {0, 1} else n * factorial(n - 1)
+    return 1 if n in {0, 1} else n * factorial_recursive(n - 1)
 
 
 if __name__ == "__main__":
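Before the fix, the "recursive" variant delegated to the module's other `factorial` implementation after the first step (presumably the iterative one defined alongside it), so it never actually recursed. A standalone sketch of the corrected self-recursion, with the error handling trimmed:

    def factorial_recursive(n: int) -> int:
        # base cases 0! = 1! = 1; otherwise recurse into *this* function
        return 1 if n in {0, 1} else n * factorial_recursive(n - 1)

    print(factorial_recursive(5))  # 120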

maths/fibonacci.py

Lines changed: 1 addition & 1 deletion
@@ -183,7 +183,7 @@ def fib_memoization(n: int) -> list[int]:
     """
     if n < 0:
         raise ValueError("n is negative")
-    # Cache must be outside recursuive function
+    # Cache must be outside recursive function
     # other it will reset every time it calls itself.
     cache: dict[int, int] = {0: 0, 1: 1, 2: 1}  # Prefilled cache
 
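The comment being fixed documents a real memoization rule: the cache must live outside the recursive function, otherwise it is rebuilt on every call and nothing is remembered. A minimal standalone sketch of the pattern (hypothetical, not the repo's `fib_memoization`):

    # cache created once, outside the recursive helper, so results persist
    cache: dict[int, int] = {0: 0, 1: 1}

    def fib(n: int) -> int:
        if n not in cache:
            cache[n] = fib(n - 1) + fib(n - 2)
        return cache[n]

    print(fib(90))  # 2880067194370816120, computed in linear time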
maths/monte_carlo.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 from statistics import mean
 
 
-def pi_estimator(iterations: int):
+def pi_estimator(iterations: int) -> None:
     """
     An implementation of the Monte Carlo method used to find pi.
     1. Draw a 2x2 square centred at (0,0).
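The docstring sketches the estimator: sample points uniformly in the 2x2 square, and the fraction landing inside the unit circle approaches pi/4. A self-contained sketch of that recipe (not the repo's exact implementation):

    from random import uniform

    def estimate_pi(iterations: int) -> float:
        # points with x^2 + y^2 <= 1 fall inside the unit circle
        inside = sum(
            1
            for _ in range(iterations)
            if uniform(-1, 1) ** 2 + uniform(-1, 1) ** 2 <= 1
        )
        return 4 * inside / iterations  # circle area (pi) over square area (4)

    print(estimate_pi(100_000))  # ~3.14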

maths/volume.py

Lines changed: 1 addition & 1 deletion
@@ -555,7 +555,7 @@ def main():
     print(f"Torus: {vol_torus(2, 2) = }")  # ~= 157.9
     print(f"Conical Frustum: {vol_conical_frustum(2, 2, 4) = }")  # ~= 58.6
     print(f"Spherical cap: {vol_spherical_cap(1, 2) = }")  # ~= 5.24
-    print(f"Spheres intersetion: {vol_spheres_intersect(2, 2, 1) = }")  # ~= 21.21
+    print(f"Spheres intersection: {vol_spheres_intersect(2, 2, 1) = }")  # ~= 21.21
     print(f"Spheres union: {vol_spheres_union(2, 2, 1) = }")  # ~= 45.81
     print(
         f"Hollow Circular Cylinder: {vol_hollow_circular_cylinder(1, 2, 3) = }"

pyproject.toml

Lines changed: 5 additions & 4 deletions
@@ -3,10 +3,9 @@ name = "thealgorithms-python"
 version = "0.0.1"
 description = "TheAlgorithms in Python"
 authors = [ { name = "TheAlgorithms Contributors" } ]
-requires-python = ">=3.13"
+requires-python = ">=3.14"
 classifiers = [
   "Programming Language :: Python :: 3 :: Only",
-  "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
   "beautifulsoup4>=4.12.3",
@@ -23,6 +22,7 @@ dependencies = [
   "pillow>=11.3",
   "rich>=13.9.4",
   "scikit-learn>=1.5.2",
+  "scipy>=1.16.2",
   "sphinx-pyproject>=0.3",
   "statsmodels>=0.14.4",
   "sympy>=1.13.3",
@@ -48,7 +48,7 @@ euler-validate = [
 ]
 
 [tool.ruff]
-target-version = "py313"
+target-version = "py314"
 
 output-format = "full"
 lint.select = [
@@ -109,7 +109,7 @@ lint.ignore = [
   # `ruff rule S101` for a description of that rule
   "B904",    # Within an `except` clause, raise exceptions with `raise ... from err` -- FIX ME
   "B905",    # `zip()` without an explicit `strict=` parameter -- FIX ME
-  "EM101",   # Exception must not use a string literal, assign to variable first
+  "EM101",   # Exception must not use a string literal, assign to a variable first
   "EXE001",  # Shebang is present but file is not executable -- DO NOT FIX
   "G004",    # Logging statement uses f-string
   "ISC001",  # Conflicts with ruff format -- DO NOT FIX
@@ -125,6 +125,7 @@ lint.ignore = [
   "S311",    # Standard pseudo-random generators are not suitable for cryptographic purposes -- FIX ME
   "SIM905",  # Consider using a list literal instead of `str.split` -- DO NOT FIX
   "SLF001",  # Private member accessed: `_Iterator` -- FIX ME
+  "UP037",   # FIX ME
 ]
 
 lint.per-file-ignores."data_structures/hashing/tests/test_hash_map.py" = [
