Merge pull request #13 from SkBlaz/copilot/fix-12

SkBlaz · web-flow · commit 5e04e3e03dba · 2025-08-17T10:45:48.000+02:00
Improve repository code quality: fix syntax errors, modernize packaging, enhance CI/CD, and resolve test failures
diff --git a/.github/workflows/core-install.yml b/.github/workflows/core-install.yml
@@ -1,7 +1,4 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Build
+name: Build and Test
 
 on:
   push:
@@ -11,15 +8,14 @@ on:
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Set up Python 3.11
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
-        python-version: 3.11
+        python-version: '3.11'
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
@@ -30,7 +26,7 @@ jobs:
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        flake8 . --count --exit-zero --max-complexity=15 --max-line-length=127 --statistics
     - name: Test with pytest
       run: |
-        cd tests; py.test;
+        python -m pytest tests/ -v
diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
@@ -1,19 +1,18 @@
-name: Py 3.11
+name: Python 3.11
 
 on: [push]
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
 
     - name: Set up Python 3.11
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
       with:
-        python-version: 3.11
+        python-version: '3.11'
 
     - name: Install dependencies
       run: |
diff --git a/autoBOTLib/__init__.py b/autoBOTLib/__init__.py
@@ -1,3 +1,21 @@
+import os
+import logging
+import nltk
+
+# Configure logging first
+logging.basicConfig(format='%(asctime)s - %(message)s',
+                    datefmt='%d-%b-%y %H:%M:%S')
+logging.getLogger().setLevel(logging.INFO)
+
+# Set environment variables
+os.environ['TOKENIZERS_PARALLELISM'] = "false"
+
+# Download NLTK resources
+nltk.download('stopwords', quiet=True)
+nltk.download('punkt_tab', quiet=True)
+nltk.download('averaged_perceptron_tagger_eng', quiet=True)
+
+# Import all module functionality
 from autoBOTLib.features.features_keyword import *
 from autoBOTLib.features.features_contextual import *
 from autoBOTLib.features.features_token_relations import *
@@ -10,17 +28,3 @@
 from autoBOTLib.optimization.optimization_feature_constructors import *
 from autoBOTLib.optimization.optimization_engine import *
 from autoBOTLib.misc.misc_helpers import *
-
-import nltk
-nltk.download('stopwords', quiet=True)  
-nltk.download('punkt_tab', quiet=True)
-nltk.download('averaged_perceptron_tagger_eng', quiet=True)
-
-import os
-import logging
-
-logging.basicConfig(format='%(asctime)s - %(message)s',
-                    datefmt='%d-%b-%y %H:%M:%S')
-logging.getLogger().setLevel(logging.INFO)
-
-os.environ['TOKENIZERS_PARALLELISM'] = "false"
diff --git a/autoBOTLib/__main__.py b/autoBOTLib/__main__.py
@@ -45,8 +45,7 @@ def main():
         "--framework",
         default="scikit",
         type=str,
-        help=
-        "The computational ML back-end to use. Currently supports scikit (Default) and pyTorch (neural nets for sparse inputs)"
+        help="The computational ML back-end to use. Currently supports scikit (Default) and pyTorch (neural nets for sparse inputs)"
     )
     parser.add_argument("--memory_storage", default="memory", type=str)
     parser.add_argument("--sparsity", default=0.05, type=float)
diff --git a/autoBOTLib/misc/misc_keyword_detection.py b/autoBOTLib/misc/misc_keyword_detection.py
@@ -80,10 +80,8 @@ def corpus_graph(self,
 
         def process_line(line):
 
-            nonlocal G
             nonlocal ctx
             nonlocal reps
-            nonlocal dictionary_with_counts_of_pairs
 
             stop = list(string.punctuation)
             line = line.strip()
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,50 @@
+[project]
+name = "autoBOTLib"
+description = "AutoBOT: Explainable AutoML for texts"
+authors = [
+    {name = "Blaž Škrlj", email = "blaz.skrlj@ijs.si"},
+]
+license = {text = "BSD-3-Clause-Clear"}
+readme = "README.md"
+requires-python = ">=3.8"
+dynamic = ["version", "dependencies", "classifiers", "scripts"]
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.flake8]
+max-line-length = 127
+max-complexity = 15
+ignore = [
+    "E402",  # module-level import not at top of file (temporarily ignored: autoBOTLib/__init__.py runs the NLTK downloads before its package imports)
+    "F401",  # imported but unused (temporarily ignore due to wildcard imports)
+    "F403",  # star import used (temporarily ignore while maintaining compatibility)
+    "W503",  # line break before binary operator
+]
+exclude = [
+    ".git",
+    "__pycache__",
+    ".pytest_cache",
+    "build",
+    "dist",
+    "*.egg-info",
+    ".tox",
+    ".venv",
+]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-v",
+    "--tb=short",
+]
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+line_length = 127
+known_first_party = ["autoBOTLib"]
diff --git a/setup.py b/setup.py
@@ -1,39 +1,22 @@
-from os import path
+from pathlib import Path
 from setuptools import setup, find_packages
-from setuptools.command.install import install
-import subprocess
-import sys
+
 
 def parse_requirements(file):
+    """Parse requirements from requirements.txt file."""
     required_packages = []
-    with open(path.join(path.dirname(__file__), file)) as req_file:
-        for line in req_file:
-            # Exclude any comments or empty lines
-            line = line.strip()
-            if line and not line.startswith("#"):
-                required_packages.append(line)
+    requirements_path = Path(__file__).parent / file
+    try:
+        with open(requirements_path) as req_file:
+            for line in req_file:
+                # Exclude any comments or empty lines
+                line = line.strip()
+                if line and not line.startswith("#"):
+                    required_packages.append(line)
+    except FileNotFoundError:
+        print(f"Warning: {file} not found. Using default requirements.")
     return required_packages
 
-class PostInstallCommand(install):
-    """Post-installation for downloading NLTK resources."""
-    def run(self):
-        install.run(self)
-        
-        try:
-            import nltk
-        except ImportError:
-            print("NLTK is not installed. Installing NLTK...")
-            subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
-            import nltk        
-        try:
-            print("Downloading NLTK 'stopwords' resource...")
-            for lib in ['stopwords', 'punkt_tab', 'averaged_perceptron_tagger_eng']:
-                subprocess.check_call([sys.executable, "-m", "nltk.downloader", lib])
-                print(f"NLTK {lib} downloaded successfully.")
-        except subprocess.CalledProcessError as e:
-            print(f"Failed to download NLTK 'stopwords': {e}")
-            sys.exit(1)  # Exit with error code
-
 long_description = """
 autoBOT is an AutoML system for text classification with an emphasis on explainability.
 It implements the idea of *representation evolution*, learning to combine representations
@@ -58,5 +41,19 @@ def run(self):
     packages=packages,
     zip_safe=False,
     include_package_data=True,
-    install_requires=parse_requirements("requirements.txt")
+    install_requires=parse_requirements("requirements.txt"),
+    classifiers=[
+        "Development Status :: 4 - Beta",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Science/Research", 
+        "License :: OSI Approved :: BSD License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: Text Processing :: Linguistic",
+    ],
+    python_requires=">=3.8",
 )
diff --git a/tests/minimal_functionality_test.py b/tests/minimal_functionality_test.py
@@ -6,11 +6,12 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn import pipeline  ## A necessary import
 import pytest
+import os
 
 
 def test_minimal_mlc():
     ## Load example data frame
-    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
+    dataframe = pd.read_csv("data/insults/train.tsv", sep="\t")
     train_sequences = dataframe['text_a']
     train_targets_c1 = dataframe['label'].values.tolist()
     train_targets_c2 = [
@@ -32,7 +33,7 @@ def test_minimal_mlc():
             strategy="direct-learning"
         )  ## strategy = "direct-learning" trains a single learner.
 
-    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
+    dataframe2 = pd.read_csv("data/insults/test.tsv", sep="\t")
     test_sequences = dataframe2['text_a']
     predictions = autoBOTLibObj.predict(test_sequences)
     prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
@@ -45,7 +46,7 @@ def test_minimal_mlc():
 
 def test_minimal():
     ## Load example data frame
-    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t").iloc[:500]
+    dataframe = pd.read_csv("data/insults/train.tsv", sep="\t").iloc[:500]
     train_sequences = dataframe['text_a']
     train_targets = dataframe['label']
 
@@ -63,7 +64,7 @@ def test_minimal():
             strategy="evolution"
         )  ## strategy = "direct-learning" trains a single learner.
 
-    dataframe2 = pd.read_csv("../data/insults/test.tsv", sep="\t")
+    dataframe2 = pd.read_csv("data/insults/test.tsv", sep="\t")
     test_sequences = dataframe2['text_a']
     predictions = autoBOTLibObj.predict(test_sequences)
     prob_predictions = autoBOTLibObj.predict_proba(test_sequences)
@@ -82,7 +83,7 @@ def test_minimal():
 def test_initializations(fold_number, representation_type, sparsity,
                          time_constraint):
 
-    dataframe = pd.read_csv("../data/insults/train.tsv", sep="\t")
+    dataframe = pd.read_csv("data/insults/train.tsv", sep="\t")
     train_sequences = dataframe['text_a']
     train_targets = dataframe['label']
     autoBOTLibObj = autoBOTLib.GAlearner(