diff --git a/non_src/tests/integration_tests/test_llm_in_context.py b/non_src/tests/integration_tests/test_llm_in_context.py
index fc5e8910..9cbe5494 100644
--- a/non_src/tests/integration_tests/test_llm_in_context.py
+++ b/non_src/tests/integration_tests/test_llm_in_context.py
@@ -8,6 +8,7 @@
 import pytest
 
 from single_task_coder import run_clean_coder_pipeline
+from tests.manual_tests.utils_for_tests import cleanup_work_dir, setup_work_dir
 
 logger = logging.getLogger()
 logger.level = logging.INFO
@@ -15,16 +16,20 @@
 
 @pytest.mark.integration
 def test_llm_no_context(tmp_path: pathlib.Path) -> None:
+# def test_llm_no_context() -> None:
     """Test that the LLM hallucinates and produces incorrect import statement without documentation context."""
     # Given the task for the LLM
     task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
     to load results to the query "cancer research". Use API key "123412367"'''
     # and given a test work directory as well as .py file
+    # folder_with_project_files = "test_llm_no_context"
+    # setup_work_dir(folder_with_project_files)
     work_dir = tmp_path / "trial"
+    work_dir.mkdir()
     py_file = work_dir / "main_dummy.py"
     content = 'print("hello world")'
     py_file.write_text(content, encoding="utf-8")
-    work_dir.mkdir()
+    os.environ["WORK_DIR"] = str(work_dir)
     # When starting single coder pipeline and making the LLM call
     run_clean_coder_pipeline(task, str(work_dir))
@@ -35,27 +40,30 @@ def test_llm_no_context(tmp_path: pathlib.Path) -> None:
     with pytest.raises(subprocess.CalledProcessError) as excinfo:
         subprocess.run(command, check=True)
     assert excinfo.value.returncode != 0
+    cleanup_work_dir()
 
-@pytest.mark.integration
-def test_llm_rag_context(tmp_path: pathlib.Path) -> None:
-    """Test that an LLM with RAG documentation makes a correct implementation of what is requested."""
-    # Given initial request for the LLM
-    task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
-    to load results to the query "cancer research". Use API key "123412367"'''
-    # and given a test work directory as well as .py file
-    work_dir = tmp_path / "trial"
-    py_file = work_dir / "main_dummy.py"
-    content = 'print("hello world")'
-    py_file.write_text(content, encoding="utf-8")
-    work_dir.mkdir()
-    os.environ["WORK_DIR"] = str(work_dir)
-    # When starting single coder pipeline and making the LLM call, with RAG
-    run_clean_coder_pipeline(task, str(work_dir),doc_harvest=True)
-    # Then assert that main_dummy.py was modified by the agents
-    assert py_file.read_text(encoding="utf-8") != content
-    # Then assert that the response is not runnable
-    command = ["python", py_file]
-    with pytest.raises(subprocess.CalledProcessError) as excinfo:
-        subprocess.run(command, check=True)
-    assert excinfo.value.returncode == 0
+# @pytest.mark.integration
+# # def test_llm_rag_context(tmp_path: pathlib.Path) -> None:
+# def test_llm_rag_context() -> None:
+#     """Test that an LLM with RAG documentation makes a correct implementation of what is requested."""
+#     # Given initial request for the LLM
+#     task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
+#     to load results to the query "cancer research". Use API key "123412367"'''
+#     # and given a test work directory as well as .py file
+#     work_dir = tmp_path / "trial"
+#     work_dir.mkdir()
+#     py_file = work_dir / "main_dummy.py"
+#     content = 'print("hello world")'
+#     py_file.write_text(content, encoding="utf-8")
+#     os.environ["WORK_DIR"] = str(work_dir)
+#     # When starting single coder pipeline and making the LLM call, with RAG
+#     run_clean_coder_pipeline(task, str(work_dir), doc_harvest=True)
+#     # Then assert that main_dummy.py was modified by the agents
+#     assert py_file.read_text(encoding="utf-8") != content
+#     # Then assert that the generated code is runnable
+#     command = ["python", str(py_file)]
+#     completed = subprocess.run(command, check=True)
+#     assert completed.returncode == 0
+#     cleanup_work_dir()
diff --git a/requirements.txt b/requirements.txt
index e0e73e24..b6cc1bb1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,4 +30,6 @@ pyright==1.1.390
 ruff==0.8.2
 httpx==0.27.2
 questionary==2.1.0
-pathspec==0.12.1
\ No newline at end of file
+pathspec==0.12.1
+crawl4ai==0.3.744
+setuptools==75.8.0
\ No newline at end of file
diff --git a/single_task_coder.py b/single_task_coder.py
index 6cfc59bd..dd938347 100644
--- a/single_task_coder.py
+++ b/single_task_coder.py
@@ -7,12 +7,13 @@ set_up_env_coder_pipeline()
 
 from src.agents.researcher_agent import Researcher
-from src.agents.doc_harvester import Doc_harvester
+from src.agents.doc_harvester import DocHarvester
 from src.agents.planner_agent import planning
 from src.agents.executor_agent import Executor
 from src.agents.debugger_agent import Debugger
 from src.agents.frontend_feedback import write_screenshot_codes
 import os
+from src.utilities.exceptions import MissingEnvironmentVariableError
 from src.utilities.user_input import user_input
 from src.utilities.start_project_functions import set_up_dot_clean_coder_dir
 from src.utilities.util_functions import create_frontend_feedback_story
@@ -22,13 +23,14 @@
 use_frontend_feedback = bool(os.getenv("FRONTEND_URL"))
 
 
-def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False):
+def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False) -> None:
+    """Single run of the Clean Coder pipeline to address a task, from Researcher to Debugger."""
     researcher = Researcher(work_dir)
     file_paths, image_paths = researcher.research_task(task)
 
     documentation = None
     if doc_harvest:
-        harvester = Doc_harvester()
-        documentation = harvester.find_documentation(task, work_dir)
+        harvester = DocHarvester()
+        documentation = harvester.find_documentation(task)
 
     plan = planning(task, file_paths, image_paths, work_dir, documentation=documentation)
 
@@ -45,7 +47,7 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
         file_paths = executor.do_task(task, plan)
 
     human_message = user_input("Please test app and provide commentary if debugging/additional refinement is needed. ")
-    if human_message in ['o', 'ok']:
+    if human_message in ["o", "ok"]:
         return
     debugger = Debugger(
         file_paths, work_dir, human_message,image_paths, playwright_codes)
@@ -57,5 +59,6 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
     set_up_dot_clean_coder_dir(work_dir)
     task = user_input("Provide task to be executed. ")
     if not work_dir:
-        raise Exception("WORK_DIR variable not provided. Please add WORK_DIR to .env file")
-    run_clean_coder_pipeline(task, work_dir)
\ No newline at end of file
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    run_clean_coder_pipeline(task, work_dir)
diff --git a/src/agents/doc_harvester.py b/src/agents/doc_harvester.py
index 3579e62d..4f8b21f4 100644
--- a/src/agents/doc_harvester.py
+++ b/src/agents/doc_harvester.py
@@ -1,11 +1,141 @@
-"""Documentation harvester pulls relevant documentation for the task by user of the pipeline."""
+"""Documentation harvester pulls relevant docs for the task set by the user of the pipeline."""
 
-from typing import Union
+import importlib
+import importlib.metadata
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.models import CrawlResult
+from dotenv import find_dotenv, load_dotenv
+
+from src.tools.rag.write_descriptions import produce_descriptions, upload_descriptions_to_vdb
+from src.tools.tools_doc_harvester import PythonLibraries
+from src.utilities.exceptions import MissingEnvironmentVariableError, ModuleImportedButNotLocatedError
+from src.utilities.llms import init_llms_mini
+from src.utilities.util_functions import join_paths
+
+load_dotenv(find_dotenv())
+
+# NOTE: crawl4ai needs a browser; run `playwright install --with-deps chromium` once before first use.
+
+
+async def pull_webpage(url: str) -> CrawlResult:
+    """Pull the content of a URL."""
+    async with AsyncWebCrawler() as crawler:
+        return await crawler.arun(
+            url=url,
+        )
+
+
+class DocHarvester:
+    """
+    Agent for collecting documentation relevant to the user's task. Requires internet access.
+
+    Identifies the Python libraries a task depends on, installs any that are missing and
+    indexes their source files into the project's vector database for retrieval.
+
+    Attributes
+    ----------
+    work_dir: str
+        Location of the project that the Clean Coder pipeline operates on.
+
+    Methods
+    -------
+    find_documentation(task: str)
+
+    Examples
+    --------
+    dh = DocHarvester()
+    task = "prepare a scraper of a website"
+    dh.find_documentation(task=task)
+    """
 
-class Doc_harvester:
     def __init__(self) -> None:
-        """Initial information to help harvest documentation from the internet."""
-        pass
-    def find_documentation(self, task: str, work_dir: str) -> Union[None, list[str]]:
-        """Returns documentation relevant for the task set by human user."""
-        return None
+        """Initial information to help harvest documentation."""
+        work_dir = os.getenv("WORK_DIR")
+        if not work_dir:
+            msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+            raise MissingEnvironmentVariableError(msg)
+        self.work_dir = work_dir
+        llms_mini = init_llms_mini(run_name="DocHarvester")
+        self.llm_mini = llms_mini[0]
+
+    def identify_libraries(self, task: str) -> list[str]:
+        """Library names relevant to the user's task. An LLM task."""
+        # TODO: generalise to cross-language
+        # TODO: use google search engine. Good prompts.
+        structured_llm = self.llm_mini.with_structured_output(PythonLibraries)
+        return structured_llm.invoke(task).libraries
+
+    def locate_module_files(self, lib: str) -> Path:
+        """Identify the location where the module's scripts are stored."""
+        imported = importlib.import_module(lib)
+        if imported.__file__:
+            return Path(imported.__file__).parent
+        msg = f"'{lib}' imported but its source files could not be located."
+        raise ModuleImportedButNotLocatedError(msg)
+
+    def identify_documentation(self, libraries: list[str]) -> dict[str, Path]:
+        """Find files of software packages useful for the task, including docstrings."""
+        # TODO: generalise to cross-language. Package managers for key languages. Browser-based for other languages.
+        # UnimplementedError for languages not supported.
+        installed = {pkg.metadata["name"] for pkg in importlib.metadata.distributions()}
+        missing = set(libraries) - installed
+        if missing:
+            # install the missing libraries into the current interpreter's environment
+            python = sys.executable
+            subprocess.check_call([python, "-m", "pip", "install", *missing], stdout=subprocess.DEVNULL)
+        lib_documentation = {}
+        for lib in libraries:
+            lib_documentation[lib] = self.locate_module_files(lib=lib)
+        return lib_documentation
+
+    def indexed_data(self, rag_input: dict[str, Path]) -> None:
+        """Prepare RAG-ready data from the scripts in the directories indicated in the input."""
+        file_description_dir = join_paths(self.work_dir, ".clean_coder/lib_documentation_descriptions")
+        file_extension_constraint = {
+            ".js",
+            ".jsx",
+            ".ts",
+            ".tsx",
+            ".vue",
+            ".py",
+            ".rb",
+            ".php",
+            ".java",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".swift",
+            ".kt",
+            ".rs",
+            ".htm",
+            ".html",
+            ".css",
+            ".scss",
+            ".sass",
+            ".less",
+            ".prompt",
+        }
+        produce_descriptions(
+            directories_with_files_to_describe=list(rag_input.values()),
+            file_description_dir=file_description_dir,
+            file_extension_constraint=file_extension_constraint,
+        )
+        chroma_collection_name = f"clean_coder_{Path(self.work_dir).name}_lib_documentation_descriptions"
+        upload_descriptions_to_vdb(
+            chroma_collection_name=chroma_collection_name,
+            file_description_dir=file_description_dir,
+        )
+
+    def find_documentation(self, task: str) -> None:
+        """Index the documentation relevant to the task set by the human user into the vector database."""
+        libraries = self.identify_libraries(task=task)
+        rag_input = self.identify_documentation(libraries=libraries)
+        self.indexed_data(rag_input=rag_input)
diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py
index 603ca265..71fc7bc6 100644
--- a/src/tools/rag/retrieval.py
+++ b/src/tools/rag/retrieval.py
@@ -1,19 +1,26 @@
+"""Functions to retrieve the most relevant documents from an indexed RAG database."""
 import os
-import cohere
-import chromadb
 from pathlib import Path
-from dotenv import load_dotenv, find_dotenv
+
+import chromadb
+import cohere
+from dotenv import find_dotenv, load_dotenv
+
+from src.utilities.exceptions import MissingEnvironmentVariableError
 
 load_dotenv(find_dotenv())
 
 work_dir = os.getenv("WORK_DIR")
+if not work_dir:
+    msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+    raise MissingEnvironmentVariableError(msg)
 cohere_key = os.getenv("COHERE_API_KEY")
 if cohere_key:
     cohere_client = cohere.Client(cohere_key)
 collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
 
 
-def get_collection():
+def get_collection() -> bool | chromadb.Collection:
+    """Return the Chroma collection from WORK_DIR when it is available."""
     if cohere_key:
         chroma_client = chromadb.PersistentClient(path=os.getenv('WORK_DIR') + '/.clean_coder/chroma_base')
         try:
@@ -28,7 +35,8 @@ def vdb_available():
     return True if get_collection() else False
 
 
-def retrieve(question):
+def retrieve(question: str) -> str:
+    """Identify the most relevant files that help answer a question."""
     # collection should be initialized once, in the class init
     collection = get_collection()
     retrieval = collection.query(query_texts=[question], n_results=8)
diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py
index 4518593e..22a5e3ad 100644
--- a/src/tools/rag/write_descriptions.py
+++ b/src/tools/rag/write_descriptions.py
@@ -1,123 +1,265 @@
+"""Functions to create an index of files for RAG."""
+
+import logging
 import os
+import sys
 from pathlib import Path
+
+import chromadb
+from dotenv import find_dotenv, load_dotenv
 from langchain.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-from dotenv import load_dotenv, find_dotenv
-import chromadb
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))
-from src.utilities.util_functions import join_paths, read_coderrules
-from src.utilities.start_work_functions import CoderIgnore, file_folder_ignored
-from src.utilities.llms import init_llms_mini
+from langchain_core.runnables.base import RunnableSequence
 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")))
+from src.utilities.exceptions import MissingEnvironmentVariableError
+from src.utilities.llms import init_llms_mini
+from src.utilities.start_work_functions import file_folder_ignored
+from src.utilities.util_functions import join_paths
 
-load_dotenv(find_dotenv())
-work_dir = os.getenv("WORK_DIR")
+# Configure the logging level
+logging.basicConfig(level=logging.INFO)
 
 
-def is_code_file(file_path):
+def relevant_extension(file_path: Path, file_extension_constraint: set[str]) -> bool:
+    """Check whether the file extension indicates a script."""
     # List of common code file extensions
-    code_extensions = {
-        '.js', '.jsx', '.ts', '.tsx', '.vue', '.py', '.rb', '.php', '.java', '.c', '.cpp', '.cs', '.go', '.swift',
-        '.kt', '.rs', '.htm','.html', '.css', '.scss', '.sass', '.less', '.prompt',
-    }
-    return file_path.suffix.lower() in code_extensions
+    return file_path.suffix.lower() in file_extension_constraint
 
 
 # read file content. place name of file in the top
-def get_content(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
+def get_content(file_path: Path) -> str:
+    """Collect the file name and content and return them together as a string."""
+    with open(file_path, encoding="utf-8") as file:
         content = file.read()
-        content = file_path.name + '\n' + content
-    return content
+    return file_path.name + "\n" + content
 
 
-def collect_file_pathes(subfolders, work_dir):
-    """
-    Collect and return a list of allowed code files from the given subfolders
-    under the work_dir according to is_code_file criteria and .coderignore patterns.
-    """
-    allowed_files = []
-    for folder in subfolders:
-        for root, _, files in os.walk(work_dir + folder):
-            for file in files:
-                file_path = Path(root) / file
-                if not is_code_file(file_path):
-                    continue
-                relative_path_str = file_path.relative_to(work_dir).as_posix()
-                if file_folder_ignored(relative_path_str):
-                    continue
-                allowed_files.append(file_path)
-    return allowed_files
+def add_to_indexing_if_relevant(root: str, file: str, file_extension_constraint: set[str] | None) -> Path | None:
+    """Return the file path if the file is to be considered."""
+    file_path = Path(root).joinpath(file)
+    if file_folder_ignored(str(file_path)):
+        # ignore files and folders mentioned in .coderignore
+        return None
+    if not file_extension_constraint:
+        return file_path
+    if relevant_extension(
+        file_path, file_extension_constraint=file_extension_constraint,
+    ):
+        return file_path
+    return None
 
 
-def write_descriptions(subfolders_with_files=['/']):
-    all_files = collect_file_pathes(subfolders_with_files, work_dir)
-    coderrules = read_coderrules()
-
-    prompt = ChatPromptTemplate.from_template(
-f"""First, get known with info about project (may be useful, may be not):
-
-'''
-{coderrules}
-'''
-
+def files_in_directory(
+    directories_with_files_to_describe: list[str | Path],
+    file_extension_constraint: set[str] | None,
+) -> list[Path | None]:
+    """Fetch the paths of the files in the given directories."""
+    files_to_describe = []
+    for directory in directories_with_files_to_describe:
+        # os.walk already yields the top-level files of `directory` in its first
+        # iteration, so no separate os.listdir pass is needed.
+        for root, _, files in os.walk(directory):
+            tmp = [
+                add_to_indexing_if_relevant(
+                    root=root,
+                    file=file,
+                    file_extension_constraint=file_extension_constraint,
+                )
+                for file in files
+            ]
+            files_to_describe.extend(tmp)
+    return files_to_describe
 
-Describe the code in 4 sentences or less, focusing only on important information from integration point of view.
-Write what file is responsible for.
-Go traight to the thing in description, without starting sentence.
 
+def save_file_description(file_path: Path, description: str, file_description_dir: str) -> None:
+    """Save a single file description."""
+    work_dir = os.getenv("WORK_DIR")
+    if not work_dir:
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    file_name = file_path.relative_to(work_dir).as_posix().replace("/", "=")
+    output_path = join_paths(file_description_dir, f"{file_name}.txt")
+    with open(output_path, "w", encoding="utf-8") as out_file:
+        out_file.write(description)
 
-'''
-{{code}}
-'''
-"""
-    )
-    llms = init_llms_mini(tools=[], run_name='File Describer')
-    llm = llms[0]
-    chain = prompt | llm | StrOutputParser()
-    description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions')
-    Path(description_folder).mkdir(parents=True, exist_ok=True)
 
+def output_descriptions(
+    files_to_describe: list[Path | None], chain: RunnableSequence, file_description_dir: str,
+) -> None:
+    """Generate file descriptions and write them to the designated directory in WORK_DIR."""
     # iterate over all files, take 8 files at once
     batch_size = 8
-    for i in range(0, len(all_files), batch_size):
-        files_iteration = all_files[i:i + batch_size]
+    for i in range(0, len(files_to_describe), batch_size):
+        files_iteration = [f for f in files_to_describe[i : i + batch_size] if f is not None]
         descriptions = chain.batch([get_content(file_path) for file_path in files_iteration])
-        print(descriptions)
+        logging.debug(descriptions)
+        for file_path, description in zip(files_iteration, descriptions, strict=True):
+            save_file_description(
+                file_path=file_path,
+                description=description,
+                file_description_dir=file_description_dir,
+            )
 
-        for file_path, description in zip(files_iteration, descriptions):
-            file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=')
-            output_path = join_paths(description_folder, f"{file_name}.txt")
-            with open(output_path, 'w', encoding='utf-8') as out_file:
-                out_file.write(description)
 
+def produce_descriptions(
+    directories_with_files_to_describe: list[str | Path],
+    file_description_dir: str,
+    file_extension_constraint: set[str] | None = None,
+) -> None:
+    """
+    Produce short descriptions of files and store them in the .clean_coder folder in WORK_DIR.
+
+    Inputs:
+    directories_with_files_to_describe: directories whose files are to be described.
+    file_description_dir: directory where the generated file descriptions are saved.
+    file_extension_constraint: the set of accepted file extensions, if provided.
+
+    Example:
+        work_dir = os.getenv("WORK_DIR")  # provide your own directory of choice if WORK_DIR is not set.
+        if not work_dir:
+            msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+            raise MissingEnvironmentVariableError(msg)
+        file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions")
+        file_extension_constraint = {
+            ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go",
+            ".swift", ".kt", ".rs", ".htm", ".html", ".css", ".scss", ".sass", ".less", ".prompt",
+        }
+        produce_descriptions(
+            directories_with_files_to_describe=[work_dir],
+            file_description_dir=file_description_dir,
+            file_extension_constraint=file_extension_constraint,
+        )
+    """
+    files_to_describe = files_in_directory(
+        directories_with_files_to_describe=directories_with_files_to_describe,
+        file_extension_constraint=file_extension_constraint,
+    )
+
+    prompt = ChatPromptTemplate.from_template(
+        """Describe the following code in 4 sentences or less, focusing only on important information
+        from the integration point of view. Write what the file is responsible for.\n\n'''\n{code}'''
+        """,
+    )
+
+    llms = init_llms_mini(tools=[], run_name="File Describer")
+    llm = llms[0]
+    chain = prompt | llm | StrOutputParser()
+    Path(file_description_dir).mkdir(parents=True, exist_ok=True)
+    output_descriptions(
+        files_to_describe=files_to_describe, chain=chain, file_description_dir=file_description_dir,
+    )
 
-def upload_descriptions_to_vdb():
-    chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base'))
-    collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
 
-    collection = chroma_client.get_or_create_collection(
-        name=collection_name
-    )
+def upload_to_collection(collection: chromadb.Collection, file_description_dir: str) -> None:
+    """Insert the file information into the chroma database."""
+    for root, _, files in os.walk(file_description_dir):
+        for file in files:
+            file_path = Path(root) / file
+            with open(file_path, encoding="utf-8") as f:
+                content = f.read()
+            collection.upsert(
+                documents=[
+                    content,
+                ],
+                ids=[file_path.name.replace("=", "/").removesuffix(".txt")],
+            )
 
-    # read files and upload to base
-    description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions')
-    for root, _, files in os.walk(description_folder):
-        for file in files:
-            file_path = Path(root) / file
-            with open(file_path, 'r', encoding='utf-8') as file:
-                content = file.read()
-            collection.upsert(
-                documents=[
-                    content
-                ],
-                ids=[file_path.name.replace('=', '/').removesuffix(".txt")],
-            )
 
+def upload_descriptions_to_vdb(
+    chroma_collection_name: str,
+    file_description_dir: str,
+    vdb_location: str = ".clean_coder/chroma_base",
+) -> None:
+    """
+    Upload the file descriptions to the chroma database.
+
+    Inputs:
+    chroma_collection_name: name of the collection within the Chroma vector database to save file descriptions in.
+    file_description_dir: directory where the generated file descriptions are available.
+    vdb_location: (optional) location for storing the vector database.
+
+    Example:
+        work_dir = os.getenv("WORK_DIR")  # provide your own directory of choice if WORK_DIR is not set.
+        if not work_dir:
+            msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+            raise MissingEnvironmentVariableError(msg)
+        file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions")
+        file_extension_constraint = {
+            ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go",
+            ".swift", ".kt", ".rs", ".htm", ".html", ".css", ".scss", ".sass", ".less", ".prompt",
+        }
+        produce_descriptions(
+            directories_with_files_to_describe=[work_dir],
+            file_description_dir=file_description_dir,
+            file_extension_constraint=file_extension_constraint,
+        )
+        chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
+        upload_descriptions_to_vdb(chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir)
+    """
+    work_dir = os.getenv("WORK_DIR")
+    if not work_dir:
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location))
+    collection = chroma_client.get_or_create_collection(
+        name=chroma_collection_name,
+    )
+
+    # read files and upload to base
+    upload_to_collection(collection=collection, file_description_dir=file_description_dir)
 
-if __name__ == '__main__':
-    #provide optionally which subfolders needs to be checked, if you don't want to describe all project folder
-    write_descriptions(subfolders_with_files=['/'])
 
-    upload_descriptions_to_vdb()
+if __name__ == "__main__":
+    # Optionally narrow directories_with_files_to_describe down to chosen subfolders
+    # if you don't want to describe the whole project folder.
+    # load environment
+    load_dotenv(find_dotenv())
+    work_dir = os.getenv("WORK_DIR")
+    if not work_dir:
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions")
+    file_extension_constraint = {
+        ".js",
+        ".jsx",
+        ".ts",
+        ".tsx",
+        ".vue",
+        ".py",
+        ".rb",
+        ".php",
+        ".java",
+        ".c",
+        ".cpp",
+        ".cs",
+        ".go",
+        ".swift",
+        ".kt",
+        ".rs",
+        ".htm",
+        ".html",
+        ".css",
+        ".scss",
+        ".sass",
+        ".less",
+        ".prompt",
+    }
+    produce_descriptions(
+        directories_with_files_to_describe=[work_dir],
+        file_description_dir=file_description_dir,
+        file_extension_constraint=file_extension_constraint,
+    )
+    chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
+    upload_descriptions_to_vdb(
+        chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir,
+    )
diff --git a/src/tools/tools_doc_harvester.py b/src/tools/tools_doc_harvester.py
new file mode 100644
index 00000000..234bfb1b
--- /dev/null
+++ b/src/tools/tools_doc_harvester.py
@@ -0,0 +1,16 @@
+"""Tools for the doc harvester."""
+from pydantic import BaseModel, Field
+
+
+class PythonLibraries(BaseModel):
+    """Identify the python libraries relevant to the user's task. No fewer than three."""
+
+    libraries: list[str] = Field(description="The list of libraries.")
+
+
+if __name__ == "__main__":
+    from langchain_openai import ChatOpenAI
+
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    structured_llm = llm.with_structured_output(PythonLibraries)
+    structured_llm.invoke("I want to create a website scraper for abcnews.com. I want to use playwright for the scraping")
diff --git a/src/utilities/exceptions.py b/src/utilities/exceptions.py
new file mode 100644
index 00000000..b9658d7f
--- /dev/null
+++ b/src/utilities/exceptions.py
@@ -0,0 +1,7 @@
+"""Custom exception messages."""
+
+class MissingEnvironmentVariableError(Exception):
+    """Environment variable missing."""
+
+class ModuleImportedButNotLocatedError(Exception):
+    """A module was imported but the location of its source files is unknown."""
diff --git a/src/utilities/llms.py b/src/utilities/llms.py
index 3885dcc9..b9fa687c 100644
--- a/src/utilities/llms.py
+++ b/src/utilities/llms.py
@@ -1,11 +1,15 @@
-from langchain_openai.chat_models import ChatOpenAI as ChatOpenRouter
-from langchain_openai.chat_models import ChatOpenAI as ChatLocalModel
-from os import getenv
+"""Utilities for loading LLMs."""
 import os
+from collections.abc import Callable
+from os import getenv
+
 from dotenv import load_dotenv
-from langchain_openai.chat_models import ChatOpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_ollama import ChatOllama
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_openai.chat_models import ChatOpenAI as ChatLocalModel
+from langchain_openai.chat_models import ChatOpenAI as ChatOpenRouter
+
 #from langchain_google_genai import ChatGoogleGenerativeAI
 
 load_dotenv()
@@ -23,7 +27,8 @@ def llm_open_router(model):
         timeout=60,
     )
 
-def llm_open_local_hosted(model):
+def llm_open_local_hosted(model: str) -> Callable:
+    """Return a locally hosted model."""
     return ChatLocalModel(
         openai_api_key="n/a",
         openai_api_base=getenv("LOCAL_MODEL_API_BASE"),
@@ -31,10 +36,20 @@
         timeout=90,
     )
 
-def init_llms(tools=None, run_name="Clean Coder", temp=0):
+def llms_with_tools_and_config(llms: list[Callable], tools: list[Callable] | None, run_name: str) -> list[Callable]:
+    """Bind the tools (when provided) and the run-name config to the loaded llms."""
+    for i, llm in enumerate(llms):
+        if tools:
+            llm = llm.bind_tools(tools)
+        llms[i] = llm.with_config({"run_name": run_name})
+    return llms
+
+
+def init_llms(tools: list[Callable] | None = None, run_name: str = "Clean Coder", temp: float = 0) -> list[Callable]:
+    """Return the available mid-sized LLM models, with tools (when provided) and config applied."""
     llms = []
     if getenv("ANTHROPIC_API_KEY"):
-        llms.append(ChatAnthropic(model='claude-3-5-sonnet-20241022', temperature=temp, timeout=60, max_tokens=2048))
+        llms.append(ChatAnthropic(model="claude-3-5-sonnet-20241022", temperature=temp, timeout=60, max_tokens=2048))
     if getenv("OPENROUTER_API_KEY"):
         llms.append(llm_open_router("anthropic/claude-3.5-sonnet"))
     if getenv("OPENAI_API_KEY"):
@@ -43,19 +58,16 @@ def init_llms(tools=None, run_name="Clean Coder", temp=0):
     # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60))
     if getenv("OLLAMA_MODEL"):
         llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL")))
-    if getenv("LOCAL_MODEL_API_BASE"):
+    if getenv("LOCAL_MODEL_API_BASE") and getenv("LOCAL_MODEL_NAME"):
         llms.append(llm_open_local_hosted(getenv("LOCAL_MODEL_NAME")))
-    for i, llm in enumerate(llms):
-        if tools:
-            llm = llm.bind_tools(tools)
-        llms[i] = llm.with_config({"run_name": run_name})
-    return llms
+    return llms_with_tools_and_config(llms=llms, tools=tools, run_name=run_name)
 
 
-def init_llms_mini(tools=None, run_name="Clean Coder", temp=0):
+def init_llms_mini(tools: list[Callable] | None = None, run_name: str = "Clean Coder", temp: float = 0) -> list[Callable]:
+    """Return the available small LLM models, with tools (when provided) and config applied."""
     llms = []
     if os.getenv("ANTHROPIC_API_KEY"):
-        llms.append(ChatAnthropic(model='claude-3-5-haiku-20241022', temperature=temp, timeout=60))
+        llms.append(ChatAnthropic(model="claude-3-5-haiku-20241022", temperature=temp, timeout=60))
     if os.getenv("OPENROUTER_API_KEY"):
         llms.append(llm_open_router("anthropic/claude-3.5-haiku"))
     if os.getenv("OPENAI_API_KEY"):
@@ -64,16 +76,13 @@ def init_llms_mini(tools=None, run_name="Clean Coder", temp=0):
     # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60))
     if os.getenv("OLLAMA_MODEL"):
         llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL")))
-    if getenv("LOCAL_MODEL_API_BASE"):
+    if getenv("LOCAL_MODEL_API_BASE") and getenv("LOCAL_MODEL_NAME"):
         llms.append(llm_open_local_hosted(getenv("LOCAL_MODEL_NAME")))
-    for i, llm in enumerate(llms):
-        if tools:
-            llm = llm.bind_tools(tools)
-        llms[i] = llm.with_config({"run_name": run_name})
-    return llms
+    return llms_with_tools_and_config(llms=llms, tools=tools, run_name=run_name)
 
 
-def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2):
+def init_llms_high_intelligence(tools: list[Callable] | None = None, run_name: str = "Clean Coder", temp: float = 0.2) -> list[Callable]:
+    """Return the available high-intelligence LLM models, with tools (when provided) and config applied."""
     llms = []
     if os.getenv("OPENAI_API_KEY"):
         llms.append(ChatOpenAI(model="o3-mini", temperature=1, timeout=60, reasoning_effort="high"))
@@ -89,10 +98,6 @@ def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2):
     # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60))
     if os.getenv("OLLAMA_MODEL"):
         llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL")))
-    if getenv("LOCAL_MODEL_API_BASE"):
+    if getenv("LOCAL_MODEL_API_BASE") and getenv("LOCAL_MODEL_NAME"):
         llms.append(llm_open_local_hosted(getenv("LOCAL_MODEL_NAME")))
-    for i, llm in enumerate(llms):
-        if tools:
-            llm = llm.bind_tools(tools)
-        llms[i] = llm.with_config({"run_name": run_name})
-    return llms
\ No newline at end of file
+    return llms_with_tools_and_config(llms=llms, tools=tools, run_name=run_name)
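
For reference, a minimal sketch of how the new doc-harvest path is exercised end to end. This is not part of the patch; it only assumes what the diff itself establishes: WORK_DIR and at least one model API key (e.g. OPENAI_API_KEY) are set in .env, and the task string is purely illustrative.

    import os

    from single_task_coder import run_clean_coder_pipeline

    # DocHarvester reads WORK_DIR from the environment, so it must point at the project.
    work_dir = os.environ["WORK_DIR"]
    task = "populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain"

    # doc_harvest=True runs DocHarvester before planning: the mini LLM proposes relevant
    # libraries, missing ones are pip-installed, their source files are described, and the
    # descriptions are uploaded to the project's Chroma collection.
    run_clean_coder_pipeline(task, work_dir, doc_harvest=True)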
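
Once the harvester has run, the indexed descriptions can be queried directly; a sketch using the same collection naming as DocHarvester.indexed_data and the same query call as src/tools/rag/retrieval.py (the query string is hypothetical):

    import os
    from pathlib import Path

    import chromadb

    work_dir = os.environ["WORK_DIR"]
    # naming convention taken from DocHarvester.indexed_data
    collection_name = f"clean_coder_{Path(work_dir).name}_lib_documentation_descriptions"

    chroma_client = chromadb.PersistentClient(path=work_dir + "/.clean_coder/chroma_base")
    collection = chroma_client.get_or_create_collection(name=collection_name)

    # mirrors the query in src/tools/rag/retrieval.py
    results = collection.query(query_texts=["PubMedAPIWrapper usage"], n_results=8)
    print(results["ids"][0])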