27 commits
5d72058 adapt single task coder to DocHarvester. Minor good practice corrections (radekrepo, Jan 28, 2025)
f3bbf86 adapt single task coder to DocHarvester. Minor good practice corrections (radekrepo, Jan 28, 2025)
d5bf5a7 function frame up to vectorization (radekrepo, Jan 28, 2025)
a301320 documentation collection step ready (radekrepo, Jan 28, 2025)
24c0d9c Merge remote-tracking branch 'remotes/upstream/dev' into rag_llm_agen… (radekrepo, Feb 4, 2025)
0750682 identification of libraries (radekrepo, Feb 6, 2025)
1f91041 no fewer than 3 libraries to return (radekrepo, Feb 6, 2025)
5b1e28b styling & documentation update (radekrepo, Feb 6, 2025)
de8c76d styling & documentation update (radekrepo, Feb 6, 2025)
9ebe57f add descriptions, organise imports (radekrepo, Feb 7, 2025)
d396376 styling fixes (radekrepo, Feb 10, 2025)
0ad34a8 styling fixes, documentation (radekrepo, Feb 10, 2025)
71f2938 refactor to generalise usage (radekrepo, Feb 11, 2025)
f73b134 include files from work_dir itself when considering files for describ… (radekrepo, Feb 11, 2025)
4f5874b document two main functions (radekrepo, Feb 11, 2025)
f65fa0f document two main functions (radekrepo, Feb 11, 2025)
80f5c9e debugged (radekrepo, Feb 11, 2025)
689beaa black reformat (radekrepo, Feb 11, 2025)
3024489 checkpoint (radekrepo, Feb 11, 2025)
102f212 Merge remote-tracking branch 'remotes/upstream/dev' into rag_llm_agen… (radekrepo, Feb 11, 2025)
a1bf8ca indexing script complete (radekrepo, Feb 11, 2025)
d2584b4 Merge remote-tracking branch 'remotes/upstream/dev' into rag_llm_agen… (radekrepo, Feb 18, 2025)
31a7117 remove work_dir kwarg (radekrepo, Feb 18, 2025)
7b10eea rename function (radekrepo, Feb 18, 2025)
3dff26c re-introduce file_folder_ignored function (radekrepo, Feb 18, 2025)
daea4f1 add explanation of file_folder_ignored (radekrepo, Feb 18, 2025)
8658319 organize imports to match best practice (radekrepo, Feb 18, 2025)
54 changes: 31 additions & 23 deletions non_src/tests/integration_tests/test_llm_in_context.py
@@ -8,23 +8,28 @@
import pytest

from single_task_coder import run_clean_coder_pipeline
from tests.manual_tests.utils_for_tests import cleanup_work_dir, setup_work_dir

logger = logging.getLogger()
logger.setLevel(logging.INFO)


@pytest.mark.integration
def test_llm_no_context(tmp_path: pathlib.Path) -> None:
"""Test that the LLM hallucinates and produces incorrect import statement without documentation context."""
# Given the task for the LLM
task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
to load results to the query "cancer research". Use API key "123412367"'''
# and given a test work directory as well as .py file
work_dir = tmp_path / "trial"
work_dir.mkdir()
py_file = work_dir / "main_dummy.py"
content = 'print("hello world")'
py_file.write_text(content, encoding="utf-8")

os.environ["WORK_DIR"] = str(work_dir)
# When starting single coder pipeline and making the LLM call
run_clean_coder_pipeline(task, str(work_dir))
@@ -35,27 +40,30 @@ def test_llm_no_context(tmp_path: pathlib.Path) -> None:
with pytest.raises(subprocess.CalledProcessError) as excinfo:
subprocess.run(command, check=True)
assert excinfo.value.returncode != 0
cleanup_work_dir()


@pytest.mark.integration
def test_llm_rag_context(tmp_path: pathlib.Path) -> None:
"""Test that an LLM with RAG documentation makes a correct implementation of what is requested."""
# Given initial request for the LLM
task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
to load results to the query "cancer research". Use API key "123412367"'''
# and given a test work directory as well as .py file
work_dir = tmp_path / "trial"
work_dir.mkdir()
py_file = work_dir / "main_dummy.py"
content = 'print("hello world")'
py_file.write_text(content, encoding="utf-8")
os.environ["WORK_DIR"] = str(work_dir)
# When starting single coder pipeline and making the LLM call, with RAG
run_clean_coder_pipeline(task, str(work_dir), doc_harvest=True)
# Then assert that main_dummy.py was modified by the agents
assert py_file.read_text(encoding="utf-8") != content
# Then assert that the response is runnable
command = ["python", str(py_file)]
result = subprocess.run(command, check=True)
assert result.returncode == 0
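
Reviewer note: test_llm_no_context asserts the generated script fails by relying on subprocess.run(check=True) raising CalledProcessError on a non-zero exit. A minimal self-contained sketch of that pattern (the file name and script body below are illustrative, not from this PR):

import subprocess
import sys

import pytest


def test_script_exits_nonzero(tmp_path):
    # check=True turns a non-zero exit code into CalledProcessError.
    bad = tmp_path / "bad.py"  # illustrative file name
    bad.write_text("raise SystemExit(1)", encoding="utf-8")
    with pytest.raises(subprocess.CalledProcessError) as excinfo:
        subprocess.run([sys.executable, str(bad)], check=True)
    assert excinfo.value.returncode == 1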
4 changes: 3 additions & 1 deletion requirements.txt
@@ -30,4 +30,6 @@ pyright==1.1.390
ruff==0.8.2
httpx==0.27.2
questionary==2.1.0
pathspec==0.12.1
crawl4ai==0.3.744
setuptools==75.8.0
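
Reviewer note: crawl4ai drives a real browser through playwright, so a one-off browser download is typically needed after installing these pins. This step is an assumption based on the comment in src/agents/doc_harvester.py; a sketch of doing it from Python:

# One-off setup sketch: fetch chromium for playwright, which crawl4ai uses.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"])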
17 changes: 10 additions & 7 deletions single_task_coder.py
@@ -7,12 +7,13 @@
set_up_env_coder_pipeline()

from src.agents.researcher_agent import Researcher
from src.agents.doc_harvester import Doc_harvester
from src.agents.doc_harvester import DocHarvester
from src.agents.planner_agent import planning
from src.agents.executor_agent import Executor
from src.agents.debugger_agent import Debugger
from src.agents.frontend_feedback import write_screenshot_codes
import os
from src.utilities.exceptions import MissingEnvironmentVariableError
from src.utilities.user_input import user_input
from src.utilities.start_project_functions import set_up_dot_clean_coder_dir
from src.utilities.util_functions import create_frontend_feedback_story
@@ -22,13 +23,14 @@
use_frontend_feedback = bool(os.getenv("FRONTEND_URL"))


def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False):
def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False) -> None:
"""Single run of clean code pipeline to address a task., from Researcher to Debugger."""
researcher = Researcher(work_dir)
file_paths, image_paths = researcher.research_task(task)
documentation = None
if doc_harvest:
harvester = Doc_harvester()
documentation = harvester.find_documentation(task, work_dir)
harvester = DocHarvester()
documentation = harvester.find_documentation(task)

plan = planning(task, file_paths, image_paths, work_dir, documentation=documentation)

@@ -45,7 +47,7 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
file_paths = executor.do_task(task, plan)

human_message = user_input("Please test app and provide commentary if debugging/additional refinement is needed. ")
if human_message in ['o', 'ok']:
if human_message in ["o", "ok"]:
return
debugger = Debugger(
file_paths, work_dir, human_message, image_paths, playwright_codes)
@@ -57,5 +59,6 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
set_up_dot_clean_coder_dir(work_dir)
task = user_input("Provide task to be executed. ")
if not work_dir:
raise Exception("WORK_DIR variable not provided. Please add WORK_DIR to .env file")
run_clean_coder_pipeline(task, work_dir)
msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
raise MissingEnvironmentVariableError(msg)
run_clean_coder_pipeline(task, work_dir)
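
A hedged usage sketch of the updated entry point with the new doc_harvest flag; the task string and path below are illustrative, not from this PR:

# Illustrative invocation of the pipeline with documentation harvesting enabled.
import os

from single_task_coder import run_clean_coder_pipeline

os.environ["WORK_DIR"] = "/tmp/demo_project"  # hypothetical project location
run_clean_coder_pipeline(
    task="add a CLI flag that prints the version",  # made-up task
    work_dir=os.environ["WORK_DIR"],
    doc_harvest=True,  # new in this PR: collect library docs before planning
)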
146 changes: 138 additions & 8 deletions src/agents/doc_harvester.py
@@ -1,11 +1,141 @@
"""Documentation harvester pulls relevant documentation for the task by user of the pipeline."""
"""Documentation harvester pulls relevant docs for the task set by the user of the pipeline."""

from typing import Union
import importlib
import importlib.metadata
import os
import subprocess
import sys
from pathlib import Path

from crawl4ai import AsyncWebCrawler
from crawl4ai.models import CrawlResult
from dotenv import find_dotenv, load_dotenv

from src.tools.rag.write_descriptions import produce_descriptions, upload_descriptions_to_vdb
from src.tools.tools_doc_harvester import PythonLibraries
from src.utilities.exceptions import MissingEnvironmentVariableError, ModuleImportedButNotLocatedError
from src.utilities.llms import init_llms_mini
from src.utilities.util_functions import join_paths

load_dotenv(find_dotenv())

# One-off environment setup required by crawl4ai: playwright install --with-deps chromium

async def pull_webpage(url: str) -> CrawlResult:
"""Pulls URL information."""
async with AsyncWebCrawler() as crawler:
return await crawler.arun(
url=url,
)


class DocHarvester:
"""
Agent for collecting documentation relevant to the user's task. Requires internet access.

Identifies Python libraries relevant to the task, installs any that are missing, and indexes their source files into a vector database for RAG retrieval.

Attributes
----------
work_dir: str
Location of the project that Clean Coder pipeline operates on.

Methods
-------
find_documentation(task: str)
Collect and index documentation relevant to the task.

Examples
--------
dh = DocHarvester()
task = "prepare a scraper of a website"
dh.find_documentation(task=task)
"""

class Doc_harvester:
def __init__(self) -> None:
"""Initial information to help harvest documentation from the internet."""
pass
def find_documentation(self, task: str, work_dir: str) -> Union[None, list[str]]:
"""Returns documentation relevant for the task set by human user."""
return None
"""Initial information to help harvest documentation."""
work_dir = os.getenv("WORK_DIR")
if not work_dir:
msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
raise MissingEnvironmentVariableError(msg)
self.work_dir = work_dir
llms_mini = init_llms_mini(run_name="DocHarvester")
self.llm_mini = llms_mini[0]

def identify_libraries(self, task: str) -> list[str]:
"""Library names relevant for user's task. An LLM task."""
# TODO: generalise to cross-language
# TODO: use google search engine. Good prompts.
structured_llm = self.llm_mini.with_structured_output(PythonLibraries)
return structured_llm.invoke(task).libraries

def locate_module_files(self, lib: str) -> Path:
"""Identify locations where module scripts are stored."""
imported = importlib.import_module(lib)
if imported.__file__:
return Path(imported.__file__).parent
msg = f"'{lib}' imported but not found."
raise ModuleImportedButNotLocatedError(msg)

def identify_documentation(self, libraries: list[str]) -> dict[str, Path]:
"""Find files of software packages useful for the task, including docstrings."""
# TODO: generalise to cross-language. Package managers for key languages. Browser-based for other languages.
# UnimplementedError for languages not supported.
installed = {pkg.metadata["name"] for pkg in importlib.metadata.distributions()}
missing = set(libraries) - installed
if missing:
# Install any missing libraries into the current environment.
python = sys.executable
subprocess.check_call([python, "-m", "pip", "install", *missing], stdout=subprocess.DEVNULL)
lib_documentation = {}
for lib in libraries:
lib_documentation[lib] = self.locate_module_files(lib=lib)
return lib_documentation

def indexed_data(self, rag_input: dict[str, Path]) -> None:
"""Prepare RAG-ready data from scripts in directories indicated in the input."""
file_description_dir = join_paths(self.work_dir, ".clean_coder/lib_documentation_descriptions")
file_extension_constraint = {
".js",
".jsx",
".ts",
".tsx",
".vue",
".py",
".rb",
".php",
".java",
".c",
".cpp",
".cs",
".go",
".swift",
".kt",
".rs",
".htm",
".html",
".css",
".scss",
".sass",
".less",
".prompt",
}
ignore = {".clean_coder", ".coderrules"}
produce_descriptions(
directories_with_files_to_describe=list(rag_input.values()),
file_description_dir=file_description_dir,
work_dir=self.work_dir,
file_extension_constraint=file_extension_constraint,
ignore=ignore,
)
chroma_collection_name = f"clean_coder_{Path(self.work_dir).name}_lib_documentation_descriptions"
upload_descriptions_to_vdb(
chroma_collection_name=chroma_collection_name,
work_dir=self.work_dir,
file_description_dir=file_description_dir,
)


def find_documentation(self, task: str) -> None | list[str]:
"""Return documentation relevant to the task set by the human user."""
libraries = self.identify_libraries(task=task)
rag_input = self.identify_documentation(libraries=libraries)
return self.indexed_data(rag_input=rag_input)
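
For reviewers, a sketch of the structured-output contract identify_libraries relies on. PythonLibraries is imported from src.tools.tools_doc_harvester and is not shown in this diff, so the model below is an assumed shape, not the PR's definition:

# Assumed shape of PythonLibraries (hypothetical; the real class lives in
# src/tools/tools_doc_harvester.py and is not part of this diff).
from pydantic import BaseModel, Field


class PythonLibraries(BaseModel):
    libraries: list[str] = Field(description="Python libraries relevant to the task")


# Usage sketch of the new agent, assuming WORK_DIR is set in .env:
from src.agents.doc_harvester import DocHarvester

harvester = DocHarvester()
harvester.find_documentation(task="scrape arXiv abstracts with langchain")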
18 changes: 13 additions & 5 deletions src/tools/rag/retrieval.py
@@ -1,19 +1,26 @@
"""Functions to retrieve the most relevant documents from an indexed RAG database."""
import os
import cohere
import chromadb
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

import chromadb
import cohere
from dotenv import find_dotenv, load_dotenv

from src.utilities.exceptions import MissingEnvironmentVariableError

load_dotenv(find_dotenv())
work_dir = os.getenv("WORK_DIR")
if not work_dir:
msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
raise MissingEnvironmentVariableError(msg)
cohere_key = os.getenv("COHERE_API_KEY")
if cohere_key:
cohere_client = cohere.Client(cohere_key)
collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"


def get_collection():
def get_collection() -> chromadb.Collection | bool:
"""Check if chroma database is available in WORK_DIR."""
if cohere_key:
chroma_client = chromadb.PersistentClient(path=work_dir + "/.clean_coder/chroma_base")
try:
@@ -28,7 +35,8 @@ def vdb_available():
return bool(get_collection())


def retrieve(question):
def retrieve(question: str) -> str:
"""Identifies the most relevant files that help answer a question."""
# collection should be initialized once, in the class init
collection = get_collection()
retrieval = collection.query(query_texts=[question], n_results=8)
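
For context, a minimal standalone sketch of the chromadb call that retrieve wraps; the path and collection name below are illustrative:

# Standalone chromadb query mirroring retrieve() above (illustrative names).
import chromadb

client = chromadb.PersistentClient(path="/tmp/demo_project/.clean_coder/chroma_base")
collection = client.get_or_create_collection("clean_coder_demo_file_descriptions")
results = collection.query(query_texts=["where is the entry point?"], n_results=8)
for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
    print(doc_id, doc[:80])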