27 commits
5d72058 adapt single task coder to DocHarvester. Minor good practice corrections (radekrepo, Jan 28, 2025)
f3bbf86 adapt single task coder to DocHarvester. Minor good practice corrections (radekrepo, Jan 28, 2025)
d5bf5a7 function frame up to vectorization (radekrepo, Jan 28, 2025)
a301320 documentation collection step ready (radekrepo, Jan 28, 2025)
24c0d9c Merge remote-tracking branch 'remotes/upstream/dev' into rag_llm_agen… (radekrepo, Feb 4, 2025)
0750682 identification of libraries (radekrepo, Feb 6, 2025)
1f91041 no fewer than 3 libraries to return (radekrepo, Feb 6, 2025)
5b1e28b styling & documentation update (radekrepo, Feb 6, 2025)
de8c76d styling & documentation update (radekrepo, Feb 6, 2025)
9ebe57f add descriptions, organise imports (radekrepo, Feb 7, 2025)
d396376 styling fixes (radekrepo, Feb 10, 2025)
0ad34a8 styling fixes, documentation (radekrepo, Feb 10, 2025)
71f2938 refactor to generalise usage (radekrepo, Feb 11, 2025)
f73b134 include files from work_dir itself when considering files for describ… (radekrepo, Feb 11, 2025)
4f5874b document two main functions (radekrepo, Feb 11, 2025)
f65fa0f document two main functions (radekrepo, Feb 11, 2025)
80f5c9e debugged (radekrepo, Feb 11, 2025)
689beaa black reformat (radekrepo, Feb 11, 2025)
3024489 checkpoint (radekrepo, Feb 11, 2025)
102f212 Merge remote-tracking branch 'remotes/upstream/dev' into rag_llm_agen… (radekrepo, Feb 11, 2025)
a1bf8ca indexing script complete (radekrepo, Feb 11, 2025)
d2584b4 Merge remote-tracking branch 'remotes/upstream/dev' into rag_llm_agen… (radekrepo, Feb 18, 2025)
31a7117 remove work_dir kwarg (radekrepo, Feb 18, 2025)
7b10eea rename function (radekrepo, Feb 18, 2025)
3dff26c re-introduce file_folder_ignored function (radekrepo, Feb 18, 2025)
daea4f1 add explanation of file_folder_ignored (radekrepo, Feb 18, 2025)
8658319 organize imports to match best practice (radekrepo, Feb 18, 2025)
54 changes: 31 additions & 23 deletions non_src/tests/integration_tests/test_llm_in_context.py
@@ -8,23 +8,28 @@
import pytest

from single_task_coder import run_clean_coder_pipeline
from tests.manual_tests.utils_for_tests import cleanup_work_dir, setup_work_dir

logger = logging.getLogger()
logger.setLevel(logging.INFO)


@pytest.mark.integration
def test_llm_no_context(tmp_path: pathlib.Path) -> None:
"""Test that the LLM hallucinates and produces incorrect import statement without documentation context."""
# Given the task for the LLM
task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
to load results to the query "cancer research". Use API key "123412367"'''
# and given a test work directory as well as .py file
work_dir = tmp_path / "trial"
work_dir.mkdir()
py_file = work_dir / "main_dummy.py"
content = 'print("hello world")'
py_file.write_text(content, encoding="utf-8")

os.environ["WORK_DIR"] = str(work_dir)
# When starting single coder pipeline and making the LLM call
run_clean_coder_pipeline(task, str(work_dir))
@@ -35,27 +40,30 @@ def test_llm_no_context(tmp_path: pathlib.Path) -> None:
with pytest.raises(subprocess.CalledProcessError) as excinfo:
subprocess.run(command, check=True)
assert excinfo.value.returncode != 0
cleanup_work_dir()


@pytest.mark.integration
def test_llm_rag_context(tmp_path: pathlib.Path) -> None:
"""Test that an LLM with RAG documentation makes a correct implementation of what is requested."""
# Given initial request for the LLM
task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
to load results to the query "cancer research". Use API key "123412367"'''
# and given a test work directory as well as .py file
work_dir = tmp_path / "trial"
work_dir.mkdir()
py_file = work_dir / "main_dummy.py"
content = 'print("hello world")'
py_file.write_text(content, encoding="utf-8")
os.environ["WORK_DIR"] = str(work_dir)
# When starting single coder pipeline and making the LLM call, with RAG
run_clean_coder_pipeline(task, str(work_dir), doc_harvest=True)
# Then assert that main_dummy.py was modified by the agents
assert py_file.read_text(encoding="utf-8") != content
# Then assert that the response is runnable
command = ["python", str(py_file)]
result = subprocess.run(command, check=True)
assert result.returncode == 0
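
Reviewer note: test_llm_no_context asserts the generated script fails by relying on subprocess.run(check=True) raising CalledProcessError on a non-zero exit. A minimal self-contained sketch of that pattern (the file name and script body below are illustrative, not from this PR):

import subprocess
import sys

import pytest


def test_script_exits_nonzero(tmp_path):
    # check=True turns a non-zero exit code into CalledProcessError.
    bad = tmp_path / "bad.py"  # illustrative file name
    bad.write_text("raise SystemExit(1)", encoding="utf-8")
    with pytest.raises(subprocess.CalledProcessError) as excinfo:
        subprocess.run([sys.executable, str(bad)], check=True)
    assert excinfo.value.returncode == 1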
4 changes: 3 additions & 1 deletion requirements.txt
@@ -30,4 +30,6 @@ pyright==1.1.390
ruff==0.8.2
httpx==0.27.2
questionary==2.1.0
pathspec==0.12.1
crawl4ai==0.3.744
setuptools==75.8.0
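
Reviewer note: crawl4ai drives a real browser through playwright, so a one-off browser download is typically needed after installing these pins. This step is an assumption based on the comment in src/agents/doc_harvester.py; a sketch of doing it from Python:

# One-off setup sketch: fetch chromium for playwright, which crawl4ai uses.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])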
17 changes: 10 additions & 7 deletions single_task_coder.py
@@ -7,12 +7,13 @@
set_up_env_coder_pipeline()

from src.agents.researcher_agent import Researcher
from src.agents.doc_harvester import Doc_harvester
from src.agents.doc_harvester import DocHarvester
from src.agents.planner_agent import planning
from src.agents.executor_agent import Executor
from src.agents.debugger_agent import Debugger
from src.agents.frontend_feedback import write_screenshot_codes
import os
from src.utilities.exceptions import MissingEnvironmentVariableError
from src.utilities.user_input import user_input
from src.utilities.start_project_functions import set_up_dot_clean_coder_dir
from src.utilities.util_functions import create_frontend_feedback_story
@@ -22,13 +23,14 @@
use_frontend_feedback = bool(os.getenv("FRONTEND_URL"))


def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False):
def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False) -> None:
"""Single run of clean code pipeline to address a task., from Researcher to Debugger."""
researcher = Researcher(work_dir)
file_paths, image_paths = researcher.research_task(task)
documentation = None
if doc_harvest:
harvester = Doc_harvester()
documentation = harvester.find_documentation(task, work_dir)
harvester = DocHarvester()
documentation = harvester.find_documentation(task)

plan = planning(task, file_paths, image_paths, work_dir, documentation=documentation)

@@ -45,7 +47,7 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
file_paths = executor.do_task(task, plan)

human_message = user_input("Please test app and provide commentary if debugging/additional refinement is needed. ")
if human_message in ['o', 'ok']:
if human_message in ["o", "ok"]:
return
debugger = Debugger(
file_paths, work_dir, human_message, image_paths, playwright_codes)
@@ -57,5 +59,6 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
set_up_dot_clean_coder_dir(work_dir)
task = user_input("Provide task to be executed. ")
if not work_dir:
raise Exception("WORK_DIR variable not provided. Please add WORK_DIR to .env file")
run_clean_coder_pipeline(task, work_dir)
msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
raise MissingEnvironmentVariableError(msg)
run_clean_coder_pipeline(task, work_dir)
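
A hedged usage sketch of the updated entry point with the new doc_harvest flag; the task string and path below are illustrative, not from this PR:

# Illustrative invocation of the pipeline with documentation harvesting enabled.
import os

from single_task_coder import run_clean_coder_pipeline

os.environ["WORK_DIR"] = "/tmp/demo_project"  # hypothetical project location
run_clean_coder_pipeline(
    task="add a CLI flag that prints the version",  # made-up task
    work_dir=os.environ["WORK_DIR"],
    doc_harvest=True,  # new in this PR: collect library docs before planning
)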
146 changes: 138 additions & 8 deletions src/agents/doc_harvester.py
@@ -1,11 +1,141 @@
"""Documentation harvester pulls relevant documentation for the task by user of the pipeline."""
"""Documentation harvester pulls relevant docs for the task set by the user of the pipeline."""

from typing import Union
import importlib
import importlib.metadata
import os
import subprocess
import sys
from pathlib import Path

from crawl4ai import AsyncWebCrawler
from crawl4ai.models import CrawlResult
from dotenv import find_dotenv, load_dotenv

from src.tools.rag.write_descriptions import produce_descriptions, upload_descriptions_to_vdb
from src.tools.tools_doc_harvester import PythonLibraries
from src.utilities.exceptions import MissingEnvironmentVariableError, ModuleImportedButNotLocatedError
from src.utilities.llms import init_llms_mini
from src.utilities.util_functions import join_paths

load_dotenv(find_dotenv())

# One-off environment setup required by crawl4ai: playwright install --with-deps chromium

async def pull_webpage(url: str) -> CrawlResult:
"""Pulls URL information."""
async with AsyncWebCrawler() as crawler:
return await crawler.arun(
url=url,
)


class DocHarvester:
"""
Agent for collecting documentation relevant to the user's task. Requires internet access.

Identifies Python libraries relevant to the task, installs any that are missing, and indexes their source files into a vector database for RAG retrieval.

Attributes
----------
work_dir: str
Location of the project that Clean Coder pipeline operates on.

Methods
-------
find_documentation(task: str)
Collect and index documentation relevant to the task.

Examples
--------
dh = DocHarvester()
task = "prepare a scraper of a website"
dh.find_documentation(task=task)
"""

class Doc_harvester:
def __init__(self) -> None:
"""Initial information to help harvest documentation from the internet."""
pass
def find_documentation(self, task: str, work_dir: str) -> Union[None, list[str]]:
"""Returns documentation relevant for the task set by human user."""
return None
"""Initial information to help harvest documentation."""
work_dir = os.getenv("WORK_DIR")
if not work_dir:
msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
raise MissingEnvironmentVariableError(msg)
self.work_dir = work_dir
llms_mini = init_llms_mini(run_name="DocHarvester")
self.llm_mini = llms_mini[0]

def identify_libraries(self, task: str) -> list[str]:
"""Library names relevant for user's task. An LLM task."""
# TODO: generalise to cross-language
# TODO: use google search engine. Good prompts.
structured_llm = self.llm_mini.with_structured_output(PythonLibraries)
return structured_llm.invoke(task).libraries

def locate_module_files(self, lib: str) -> Path:
"""Identify locations where module scripts are stored."""
imported = importlib.import_module(lib)
if imported.__file__:
return Path(imported.__file__).parent
msg = f"'{lib}' imported but not found."
raise ModuleImportedButNotLocatedError(msg)

def identify_documentation(self, libraries: list[str]) -> dict[str, Path]:
"""Find files of software packages useful for the task, including docstrings."""
# TODO: generalise to cross-language. Package managers for key languages. Browser-based for other languages.
# UnimplementedError for languages not supported.
installed = {pkg.metadata["name"] for pkg in importlib.metadata.distributions()}
missing = set(libraries) - installed
if missing:
# Install any missing libraries into the current environment.
python = sys.executable
subprocess.check_call([python, "-m", "pip", "install", *missing], stdout=subprocess.DEVNULL)
lib_documentation = {}
for lib in libraries:
lib_documentation[lib] = self.locate_module_files(lib=lib)
return lib_documentation

def indexed_data(self, rag_input: dict[str, Path]) -> None:
"""Prepare RAG-ready data from scripts in directories indicated in the input."""
file_description_dir = join_paths(self.work_dir, ".clean_coder/lib_documentation_descriptions")
file_extension_constraint = {
".js",
".jsx",
".ts",
".tsx",
".vue",
".py",
".rb",
".php",
".java",
".c",
".cpp",
".cs",
".go",
".swift",
".kt",
".rs",
".htm",
".html",
".css",
".scss",
".sass",
".less",
".prompt",
}
ignore = {".clean_coder", ".coderrules"}
produce_descriptions(
directories_with_files_to_describe=list(rag_input.values()),
file_description_dir=file_description_dir,
work_dir=self.work_dir,
file_extension_constraint=file_extension_constraint,
ignore=ignore,
)
chroma_collection_name = f"clean_coder_{Path(self.work_dir).name}_lib_documentation_descriptions"
upload_descriptions_to_vdb(
chroma_collection_name=chroma_collection_name,
work_dir=self.work_dir,
file_description_dir=file_description_dir,
)


def find_documentation(self, task: str) -> None | list[str]:
"""Return documentation relevant to the task set by the human user."""
libraries = self.identify_libraries(task=task)
rag_input = self.identify_documentation(libraries=libraries)
return self.indexed_data(rag_input=rag_input)
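
For reviewers, a sketch of the structured-output contract identify_libraries relies on. PythonLibraries is imported from src.tools.tools_doc_harvester and is not shown in this diff, so the model below is an assumed shape, not the PR's definition:

# Assumed shape of PythonLibraries (hypothetical; the real class lives in
# src/tools/tools_doc_harvester.py and is not part of this diff).
from pydantic import BaseModel, Field


class PythonLibraries(BaseModel):
    libraries: list[str] = Field(description="Python libraries relevant to the task")


# Usage sketch of the new agent, assuming WORK_DIR is set in .env:
from src.agents.doc_harvester import DocHarvester

harvester = DocHarvester()
harvester.find_documentation(task="scrape arXiv abstracts with langchain")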
18 changes: 13 additions & 5 deletions src/tools/rag/retrieval.py
@@ -1,19 +1,26 @@
"""Functions to retrieve the most relevant documents from an indexed RAG database."""
import os
import cohere
import chromadb
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

import chromadb
import cohere
from dotenv import find_dotenv, load_dotenv

from src.utilities.exceptions import MissingEnvironmentVariableError

load_dotenv(find_dotenv())
work_dir = os.getenv("WORK_DIR")
if not work_dir:
msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
raise MissingEnvironmentVariableError(msg)
cohere_key = os.getenv("COHERE_API_KEY")
if cohere_key:
cohere_client = cohere.Client(cohere_key)
collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"


def get_collection():
def get_collection() -> chromadb.Collection | bool:
"""Check if chroma database is available in WORK_DIR."""
if cohere_key:
chroma_client = chromadb.PersistentClient(path=work_dir + "/.clean_coder/chroma_base")
try:
@@ -28,7 +35,8 @@ def vdb_available():
return bool(get_collection())


def retrieve(question):
def retrieve(question: str) -> str:
"""Identifies the most relevant files that help answer a question."""
# collection should be initialized once, in the class init
collection = get_collection()
retrieval = collection.query(query_texts=[question], n_results=8)
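
For context, a minimal standalone sketch of the chromadb call that retrieve wraps; the path and collection name below are illustrative:

# Standalone chromadb query mirroring retrieve() above (illustrative names).
import chromadb

client = chromadb.PersistentClient(path="/tmp/demo_project/.clean_coder/chroma_base")
collection = client.get_or_create_collection("clean_coder_demo_file_descriptions")
results = collection.query(query_texts=["where is the entry point?"], n_results=8)
for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
    print(doc_id, doc[:80])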