diff --git a/non_src/tests/integration_tests/test_llm_in_context.py b/non_src/tests/integration_tests/test_llm_in_context.py
index fc5e8910..9cbe5494 100644
--- a/non_src/tests/integration_tests/test_llm_in_context.py
+++ b/non_src/tests/integration_tests/test_llm_in_context.py
@@ -8,6 +8,7 @@
 import pytest
 
 from single_task_coder import run_clean_coder_pipeline
+from tests.manual_tests.utils_for_tests import cleanup_work_dir, setup_work_dir
 
 logger = logging.getLogger()
 logger.level = logging.INFO
@@ -15,16 +16,20 @@
 
 @pytest.mark.integration
 def test_llm_no_context(tmp_path: pathlib.Path) -> None:
+# def test_llm_no_context() -> None:
     """Test that the LLM hallucinates and produces incorrect import statement without documentation context."""
     # Given the task for the LLM
     task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
     to load results to the query "cancer research". Use API key "123412367"'''
     # and given a test work directory as well as .py file
+    # folder_with_project_files = "test_llm_no_context"
+    # setup_work_dir(folder_with_project_files)
     work_dir = tmp_path / "trial"
+    work_dir.mkdir()
     py_file = work_dir / "main_dummy.py"
     content = 'print("hello world")'
     py_file.write_text(content, encoding="utf-8")
-    work_dir.mkdir()
+    os.environ["WORK_DIR"] = str(work_dir)
     # When starting single coder pipeline and making the LLM call
     run_clean_coder_pipeline(task, str(work_dir))
@@ -35,27 +40,30 @@ def test_llm_no_context(tmp_path: pathlib.Path) -> None:
     with pytest.raises(subprocess.CalledProcessError) as excinfo:
         subprocess.run(command, check=True)
     assert excinfo.value.returncode != 0
+    cleanup_work_dir()
 
-@pytest.mark.integration
-def test_llm_rag_context(tmp_path: pathlib.Path) -> None:
-    """Test that an LLM with RAG documentation makes a correct implementation of what is requested."""
-    # Given initial request for the LLM
-    task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
-    to load results to the query "cancer research". Use API key "123412367"'''
-    # and given a test work directory as well as .py file
-    work_dir = tmp_path / "trial"
-    py_file = work_dir / "main_dummy.py"
-    content = 'print("hello world")'
-    py_file.write_text(content, encoding="utf-8")
-    work_dir.mkdir()
-    os.environ["WORK_DIR"] = str(work_dir)
-    # When starting single coder pipeline and making the LLM call, with RAG
-    run_clean_coder_pipeline(task, str(work_dir),doc_harvest=True)
-    # Then assert that main_dummy.py was modified by the agents
-    assert py_file.read_text(encoding="utf-8") != content
-    # Then assert that the response is not runnable
-    command = ["python", py_file]
-    with pytest.raises(subprocess.CalledProcessError) as excinfo:
-        subprocess.run(command, check=True)
-    assert excinfo.value.returncode == 0
+# @pytest.mark.integration
+# # def test_llm_rag_context(tmp_path: pathlib.Path) -> None:
+# def test_llm_rag_context() -> None:
+#     """Test that an LLM with RAG documentation makes a correct implementation of what is requested."""
+#     # Given initial request for the LLM
+#     task = '''populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain,
+#     to load results to the query "cancer research". Use API key "123412367"'''
+#     # and given a test work directory as well as .py file
+#     work_dir = tmp_path / "trial"
+#     work_dir.mkdir()
+#     py_file = work_dir / "main_dummy.py"
+#     content = 'print("hello world")'
+#     py_file.write_text(content, encoding="utf-8")
+#     os.environ["WORK_DIR"] = str(work_dir)
+#     # When starting single coder pipeline and making the LLM call, with RAG
+#     run_clean_coder_pipeline(task, str(work_dir), doc_harvest=True)
+#     # Then assert that main_dummy.py was modified by the agents
+#     assert py_file.read_text(encoding="utf-8") != content
+#     # Then assert that the generated code is runnable
+#     command = ["python", str(py_file)]
+#     completed = subprocess.run(command, check=True)
+#     assert completed.returncode == 0
+#     cleanup_work_dir()
diff --git a/requirements.txt b/requirements.txt
index e0e73e24..b6cc1bb1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,4 +30,6 @@ pyright==1.1.390
 ruff==0.8.2
 httpx==0.27.2
 questionary==2.1.0
-pathspec==0.12.1
\ No newline at end of file
+pathspec==0.12.1
+crawl4ai==0.3.744
+setuptools==75.8.0
\ No newline at end of file
diff --git a/single_task_coder.py b/single_task_coder.py
index 6cfc59bd..dd938347 100644
--- a/single_task_coder.py
+++ b/single_task_coder.py
@@ -7,12 +7,13 @@ set_up_env_coder_pipeline()
 
 from src.agents.researcher_agent import Researcher
-from src.agents.doc_harvester import Doc_harvester
+from src.agents.doc_harvester import DocHarvester
 from src.agents.planner_agent import planning
 from src.agents.executor_agent import Executor
 from src.agents.debugger_agent import Debugger
 from src.agents.frontend_feedback import write_screenshot_codes
 import os
+from src.utilities.exceptions import MissingEnvironmentVariableError
 from src.utilities.user_input import user_input
 from src.utilities.start_project_functions import set_up_dot_clean_coder_dir
 from src.utilities.util_functions import create_frontend_feedback_story
@@ -22,13 +23,14 @@
 use_frontend_feedback = bool(os.getenv("FRONTEND_URL"))
 
 
-def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False):
+def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False) -> None:
+    """Single run of the Clean Coder pipeline to address a task, from Researcher to Debugger."""
     researcher = Researcher(work_dir)
     file_paths, image_paths = researcher.research_task(task)
 
     documentation = None
     if doc_harvest:
-        harvester = Doc_harvester()
-        documentation = harvester.find_documentation(task, work_dir)
+        harvester = DocHarvester()
+        documentation = harvester.find_documentation(task)
 
     plan = planning(task, file_paths, image_paths, work_dir, documentation=documentation)
 
@@ -45,7 +47,7 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
         file_paths = executor.do_task(task, plan)
 
     human_message = user_input("Please test app and provide commentary if debugging/additional refinement is needed. ")
-    if human_message in ['o', 'ok']:
+    if human_message in ["o", "ok"]:
         return
     debugger = Debugger(
         file_paths, work_dir, human_message,image_paths, playwright_codes)
@@ -57,5 +59,6 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False
     set_up_dot_clean_coder_dir(work_dir)
     task = user_input("Provide task to be executed. ")
     if not work_dir:
-        raise Exception("WORK_DIR variable not provided. Please add WORK_DIR to .env file")
-    run_clean_coder_pipeline(task, work_dir)
\ No newline at end of file
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    run_clean_coder_pipeline(task, work_dir)
diff --git a/src/agents/doc_harvester.py b/src/agents/doc_harvester.py
index 3579e62d..4f8b21f4 100644
--- a/src/agents/doc_harvester.py
+++ b/src/agents/doc_harvester.py
@@ -1,11 +1,141 @@
-"""Documentation harvester pulls relevant documentation for the task by user of the pipeline."""
+"""Documentation harvester pulls relevant docs for the task set by the user of the pipeline."""
 
-from typing import Union
+import importlib
+import importlib.metadata
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.models import CrawlResult
+from dotenv import find_dotenv, load_dotenv
+
+from src.tools.rag.write_descriptions import produce_descriptions, upload_descriptions_to_vdb
+from src.tools.tools_doc_harvester import PythonLibraries
+from src.utilities.exceptions import MissingEnvironmentVariableError, ModuleImportedButNotLocatedError
+from src.utilities.llms import init_llms_mini
+from src.utilities.util_functions import join_paths
+
+load_dotenv(find_dotenv())
+
+# NOTE: crawl4ai needs a browser; run `playwright install --with-deps chromium` once before first use.
+
+
+async def pull_webpage(url: str) -> CrawlResult:
+    """Pull the content of a URL."""
+    async with AsyncWebCrawler() as crawler:
+        return await crawler.arun(
+            url=url,
+        )
+
+
+class DocHarvester:
+    """
+    Agent for collecting documentation relevant to the user's task. Requires internet access.
+
+    Identifies the Python libraries a task depends on, installs any that are missing and
+    indexes their source files into the project's vector database for retrieval.
+
+    Attributes
+    ----------
+    work_dir: str
+        Location of the project that the Clean Coder pipeline operates on.
+
+    Methods
+    -------
+    find_documentation(task: str)
+
+    Examples
+    --------
+    dh = DocHarvester()
+    task = "prepare a scraper of a website"
+    dh.find_documentation(task=task)
+    """
 
-class Doc_harvester:
     def __init__(self) -> None:
-        """Initial information to help harvest documentation from the internet."""
-        pass
-    def find_documentation(self, task: str, work_dir: str) -> Union[None, list[str]]:
-        """Returns documentation relevant for the task set by human user."""
-        return None
+        """Initial information to help harvest documentation."""
+        work_dir = os.getenv("WORK_DIR")
+        if not work_dir:
+            msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+            raise MissingEnvironmentVariableError(msg)
+        self.work_dir = work_dir
+        llms_mini = init_llms_mini(run_name="DocHarvester")
+        self.llm_mini = llms_mini[0]
+
+    def identify_libraries(self, task: str) -> list[str]:
+        """Library names relevant to the user's task. An LLM task."""
+        # TODO: generalise to cross-language
+        # TODO: use google search engine. Good prompts.
+        structured_llm = self.llm_mini.with_structured_output(PythonLibraries)
+        return structured_llm.invoke(task).libraries
+
+    def locate_module_files(self, lib: str) -> Path:
+        """Identify the location where the module's scripts are stored."""
+        imported = importlib.import_module(lib)
+        if imported.__file__:
+            return Path(imported.__file__).parent
+        msg = f"'{lib}' imported but its source files could not be located."
+        raise ModuleImportedButNotLocatedError(msg)
+
+    def identify_documentation(self, libraries: list[str]) -> dict[str, Path]:
+        """Find files of software packages useful for the task, including docstrings."""
+        # TODO: generalise to cross-language. Package managers for key languages. Browser-based for other languages.
+        # UnimplementedError for languages not supported.
+        installed = {pkg.metadata["name"] for pkg in importlib.metadata.distributions()}
+        missing = set(libraries) - installed
+        if missing:
+            # install the missing libraries into the current interpreter's environment
+            python = sys.executable
+            subprocess.check_call([python, "-m", "pip", "install", *missing], stdout=subprocess.DEVNULL)
+        lib_documentation = {}
+        for lib in libraries:
+            lib_documentation[lib] = self.locate_module_files(lib=lib)
+        return lib_documentation
+
+    def indexed_data(self, rag_input: dict[str, Path]) -> None:
+        """Prepare RAG-ready data from the scripts in the directories indicated in the input."""
+        file_description_dir = join_paths(self.work_dir, ".clean_coder/lib_documentation_descriptions")
+        file_extension_constraint = {
+            ".js",
+            ".jsx",
+            ".ts",
+            ".tsx",
+            ".vue",
+            ".py",
+            ".rb",
+            ".php",
+            ".java",
+            ".c",
+            ".cpp",
+            ".cs",
+            ".go",
+            ".swift",
+            ".kt",
+            ".rs",
+            ".htm",
+            ".html",
+            ".css",
+            ".scss",
+            ".sass",
+            ".less",
+            ".prompt",
+        }
+        produce_descriptions(
+            directories_with_files_to_describe=list(rag_input.values()),
+            file_description_dir=file_description_dir,
+            file_extension_constraint=file_extension_constraint,
+        )
+        chroma_collection_name = f"clean_coder_{Path(self.work_dir).name}_lib_documentation_descriptions"
+        upload_descriptions_to_vdb(
+            chroma_collection_name=chroma_collection_name,
+            file_description_dir=file_description_dir,
+        )
+
+    def find_documentation(self, task: str) -> None:
+        """Index the documentation relevant to the task set by the human user into the vector database."""
+        libraries = self.identify_libraries(task=task)
+        rag_input = self.identify_documentation(libraries=libraries)
+        self.indexed_data(rag_input=rag_input)
diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py
index 603ca265..71fc7bc6 100644
--- a/src/tools/rag/retrieval.py
+++ b/src/tools/rag/retrieval.py
@@ -1,19 +1,26 @@
+"""Functions to retrieve the most relevant documents from an indexed RAG database."""
 import os
-import cohere
-import chromadb
 from pathlib import Path
-from dotenv import load_dotenv, find_dotenv
+
+import chromadb
+import cohere
+from dotenv import find_dotenv, load_dotenv
+
+from src.utilities.exceptions import MissingEnvironmentVariableError
 
 load_dotenv(find_dotenv())
 
 work_dir = os.getenv("WORK_DIR")
+if not work_dir:
+    msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+    raise MissingEnvironmentVariableError(msg)
 cohere_key = os.getenv("COHERE_API_KEY")
 if cohere_key:
     cohere_client = cohere.Client(cohere_key)
 collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
 
 
-def get_collection():
+def get_collection() -> bool | chromadb.Collection:
+    """Return the Chroma collection from WORK_DIR when it is available."""
     if cohere_key:
         chroma_client = chromadb.PersistentClient(path=os.getenv('WORK_DIR') + '/.clean_coder/chroma_base')
         try:
@@ -28,7 +35,8 @@ def vdb_available():
     return True if get_collection() else False
 
 
-def retrieve(question):
+def retrieve(question: str) -> str:
+    """Identify the most relevant files that help answer a question."""
     # collection should be initialized once, in the class init
     collection = get_collection()
     retrieval = collection.query(query_texts=[question], n_results=8)
diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py
index 4518593e..22a5e3ad 100644
--- a/src/tools/rag/write_descriptions.py
+++ b/src/tools/rag/write_descriptions.py
@@ -1,123 +1,265 @@
+"""Functions to create an index of files for RAG."""
+
+import logging
 import os
+import sys
 from pathlib import Path
+
+import chromadb
+from dotenv import find_dotenv, load_dotenv
 from langchain.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-from dotenv import load_dotenv, find_dotenv
-import chromadb
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))
-from src.utilities.util_functions import join_paths, read_coderrules
-from src.utilities.start_work_functions import CoderIgnore, file_folder_ignored
-from src.utilities.llms import init_llms_mini
+from langchain_core.runnables.base import RunnableSequence
 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "..")))
+from src.utilities.exceptions import MissingEnvironmentVariableError
+from src.utilities.llms import init_llms_mini
+from src.utilities.start_work_functions import file_folder_ignored
+from src.utilities.util_functions import join_paths
 
-load_dotenv(find_dotenv())
-work_dir = os.getenv("WORK_DIR")
+# Configure the logging level
+logging.basicConfig(level=logging.INFO)
 
 
-def is_code_file(file_path):
+def relevant_extension(file_path: Path, file_extension_constraint: set[str]) -> bool:
+    """Check whether the file extension indicates a script."""
     # List of common code file extensions
-    code_extensions = {
-        '.js', '.jsx', '.ts', '.tsx', '.vue', '.py', '.rb', '.php', '.java', '.c', '.cpp', '.cs', '.go', '.swift',
-        '.kt', '.rs', '.htm','.html', '.css', '.scss', '.sass', '.less', '.prompt',
-    }
-    return file_path.suffix.lower() in code_extensions
+    return file_path.suffix.lower() in file_extension_constraint
 
 
 # read file content. place name of file in the top
-def get_content(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
+def get_content(file_path: Path) -> str:
+    """Collect the file name and content and return them together as a string."""
+    with open(file_path, encoding="utf-8") as file:
         content = file.read()
-        content = file_path.name + '\n' + content
-    return content
+    return file_path.name + "\n" + content
 
 
-def collect_file_pathes(subfolders, work_dir):
-    """
-    Collect and return a list of allowed code files from the given subfolders
-    under the work_dir according to is_code_file criteria and .coderignore patterns.
-    """
-    allowed_files = []
-    for folder in subfolders:
-        for root, _, files in os.walk(work_dir + folder):
-            for file in files:
-                file_path = Path(root) / file
-                if not is_code_file(file_path):
-                    continue
-                relative_path_str = file_path.relative_to(work_dir).as_posix()
-                if file_folder_ignored(relative_path_str):
-                    continue
-                allowed_files.append(file_path)
-    return allowed_files
+def add_to_indexing_if_relevant(root: str, file: str, file_extension_constraint: set[str] | None) -> Path | None:
+    """Return the file path if the file is to be considered."""
+    file_path = Path(root).joinpath(file)
+    if file_folder_ignored(str(file_path)):
+        # ignore files and folders mentioned in .coderignore
+        return None
+    if not file_extension_constraint:
+        return file_path
+    if relevant_extension(
+        file_path, file_extension_constraint=file_extension_constraint,
+    ):
+        return file_path
+    return None
 
 
-def write_descriptions(subfolders_with_files=['/']):
-    all_files = collect_file_pathes(subfolders_with_files, work_dir)
-    coderrules = read_coderrules()
-
-    prompt = ChatPromptTemplate.from_template(
-f"""First, get known with info about project (may be useful, may be not):
-
-'''
-{coderrules}
-'''
-
+def files_in_directory(
+    directories_with_files_to_describe: list[str | Path],
+    file_extension_constraint: set[str] | None,
+) -> list[Path | None]:
+    """Fetch the paths of the files in the given directories."""
+    files_to_describe = []
+    for directory in directories_with_files_to_describe:
+        # os.walk already yields the top-level files of `directory` in its first
+        # iteration, so no separate os.listdir pass is needed.
+        for root, _, files in os.walk(directory):
+            tmp = [
+                add_to_indexing_if_relevant(
+                    root=root,
+                    file=file,
+                    file_extension_constraint=file_extension_constraint,
+                )
+                for file in files
+            ]
+            files_to_describe.extend(tmp)
+    return files_to_describe
 
-Describe the code in 4 sentences or less, focusing only on important information from integration point of view.
-Write what file is responsible for.
-Go traight to the thing in description, without starting sentence.
 
+def save_file_description(file_path: Path, description: str, file_description_dir: str) -> None:
+    """Save a single file description."""
+    work_dir = os.getenv("WORK_DIR")
+    if not work_dir:
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    file_name = file_path.relative_to(work_dir).as_posix().replace("/", "=")
+    output_path = join_paths(file_description_dir, f"{file_name}.txt")
+    with open(output_path, "w", encoding="utf-8") as out_file:
+        out_file.write(description)
 
-'''
-{{code}}
-'''
-"""
-    )
-    llms = init_llms_mini(tools=[], run_name='File Describer')
-    llm = llms[0]
-    chain = prompt | llm | StrOutputParser()
-    description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions')
-    Path(description_folder).mkdir(parents=True, exist_ok=True)
 
+def output_descriptions(
+    files_to_describe: list[Path | None], chain: RunnableSequence, file_description_dir: str,
+) -> None:
+    """Generate file descriptions and write them to the designated directory in WORK_DIR."""
     # iterate over all files, take 8 files at once
     batch_size = 8
-    for i in range(0, len(all_files), batch_size):
-        files_iteration = all_files[i:i + batch_size]
+    for i in range(0, len(files_to_describe), batch_size):
+        files_iteration = [f for f in files_to_describe[i : i + batch_size] if f is not None]
         descriptions = chain.batch([get_content(file_path) for file_path in files_iteration])
-        print(descriptions)
+        logging.debug(descriptions)
+        for file_path, description in zip(files_iteration, descriptions, strict=True):
+            save_file_description(
+                file_path=file_path,
+                description=description,
+                file_description_dir=file_description_dir,
+            )
 
-        for file_path, description in zip(files_iteration, descriptions):
-            file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=')
-            output_path = join_paths(description_folder, f"{file_name}.txt")
-            with open(output_path, 'w', encoding='utf-8') as out_file:
-                out_file.write(description)
 
+def produce_descriptions(
+    directories_with_files_to_describe: list[str | Path],
+    file_description_dir: str,
+    file_extension_constraint: set[str] | None = None,
+) -> None:
+    """
+    Produce short descriptions of files and store them in the .clean_coder folder in WORK_DIR.
+
+    Inputs:
+    directories_with_files_to_describe: directories whose files are to be described.
+    file_description_dir: directory where the generated file descriptions are saved.
+    file_extension_constraint: the set of accepted file extensions, if provided.
+
+    Example:
+        work_dir = os.getenv("WORK_DIR")  # provide your own directory of choice if WORK_DIR is not set.
+        if not work_dir:
+            msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+            raise MissingEnvironmentVariableError(msg)
+        file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions")
+        file_extension_constraint = {
+            ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go",
+            ".swift", ".kt", ".rs", ".htm", ".html", ".css", ".scss", ".sass", ".less", ".prompt",
+        }
+        produce_descriptions(
+            directories_with_files_to_describe=[work_dir],
+            file_description_dir=file_description_dir,
+            file_extension_constraint=file_extension_constraint,
+        )
+    """
+    files_to_describe = files_in_directory(
+        directories_with_files_to_describe=directories_with_files_to_describe,
+        file_extension_constraint=file_extension_constraint,
+    )
+
+    prompt = ChatPromptTemplate.from_template(
+        """Describe the following code in 4 sentences or less, focusing only on important information
+        from the integration point of view. Write what the file is responsible for.\n\n'''\n{code}'''
+        """,
+    )
+
+    llms = init_llms_mini(tools=[], run_name="File Describer")
+    llm = llms[0]
+    chain = prompt | llm | StrOutputParser()
+    Path(file_description_dir).mkdir(parents=True, exist_ok=True)
+    output_descriptions(
+        files_to_describe=files_to_describe, chain=chain, file_description_dir=file_description_dir,
+    )
 
-def upload_descriptions_to_vdb():
-    chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base'))
-    collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
 
-    collection = chroma_client.get_or_create_collection(
-        name=collection_name
-    )
+def upload_to_collection(collection: chromadb.Collection, file_description_dir: str) -> None:
+    """Insert the file information into the chroma database."""
+    for root, _, files in os.walk(file_description_dir):
+        for file in files:
+            file_path = Path(root) / file
+            with open(file_path, encoding="utf-8") as f:
+                content = f.read()
+            collection.upsert(
+                documents=[
+                    content,
+                ],
+                ids=[file_path.name.replace("=", "/").removesuffix(".txt")],
+            )
 
-    # read files and upload to base
-    description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions')
-    for root, _, files in os.walk(description_folder):
-        for file in files:
-            file_path = Path(root) / file
-            with open(file_path, 'r', encoding='utf-8') as file:
-                content = file.read()
-            collection.upsert(
-                documents=[
-                    content
-                ],
-                ids=[file_path.name.replace('=', '/').removesuffix(".txt")],
-            )
 
+def upload_descriptions_to_vdb(
+    chroma_collection_name: str,
+    file_description_dir: str,
+    vdb_location: str = ".clean_coder/chroma_base",
+) -> None:
+    """
+    Upload the file descriptions to the chroma database.
+
+    Inputs:
+    chroma_collection_name: name of the collection within the Chroma vector database to save file descriptions in.
+    file_description_dir: directory where the generated file descriptions are available.
+    vdb_location: (optional) location for storing the vector database.
+
+    Example:
+        work_dir = os.getenv("WORK_DIR")  # provide your own directory of choice if WORK_DIR is not set.
+        if not work_dir:
+            msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+            raise MissingEnvironmentVariableError(msg)
+        file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions")
+        file_extension_constraint = {
+            ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go",
+            ".swift", ".kt", ".rs", ".htm", ".html", ".css", ".scss", ".sass", ".less", ".prompt",
+        }
+        produce_descriptions(
+            directories_with_files_to_describe=[work_dir],
+            file_description_dir=file_description_dir,
+            file_extension_constraint=file_extension_constraint,
+        )
+        chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
+        upload_descriptions_to_vdb(chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir)
+    """
+    work_dir = os.getenv("WORK_DIR")
+    if not work_dir:
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location))
+    collection = chroma_client.get_or_create_collection(
+        name=chroma_collection_name,
+    )
+
+    # read files and upload to base
+    upload_to_collection(collection=collection, file_description_dir=file_description_dir)
 
-if __name__ == '__main__':
-    #provide optionally which subfolders needs to be checked, if you don't want to describe all project folder
-    write_descriptions(subfolders_with_files=['/'])
 
-    upload_descriptions_to_vdb()
+if __name__ == "__main__":
+    # Optionally narrow directories_with_files_to_describe down to chosen subfolders
+    # if you don't want to describe the whole project folder.
+    # load environment
+    load_dotenv(find_dotenv())
+    work_dir = os.getenv("WORK_DIR")
+    if not work_dir:
+        msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file"
+        raise MissingEnvironmentVariableError(msg)
+    file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions")
+    file_extension_constraint = {
+        ".js",
+        ".jsx",
+        ".ts",
+        ".tsx",
+        ".vue",
+        ".py",
+        ".rb",
+        ".php",
+        ".java",
+        ".c",
+        ".cpp",
+        ".cs",
+        ".go",
+        ".swift",
+        ".kt",
+        ".rs",
+        ".htm",
+        ".html",
+        ".css",
+        ".scss",
+        ".sass",
+        ".less",
+        ".prompt",
+    }
+    produce_descriptions(
+        directories_with_files_to_describe=[work_dir],
+        file_description_dir=file_description_dir,
+        file_extension_constraint=file_extension_constraint,
+    )
+    chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions"
+    upload_descriptions_to_vdb(
+        chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir,
+    )
diff --git a/src/tools/tools_doc_harvester.py b/src/tools/tools_doc_harvester.py
new file mode 100644
index 00000000..234bfb1b
--- /dev/null
+++ b/src/tools/tools_doc_harvester.py
@@ -0,0 +1,16 @@
+"""Tools for the doc harvester."""
+from pydantic import BaseModel, Field
+
+
+class PythonLibraries(BaseModel):
+    """Identify the python libraries relevant to the user's task. No fewer than three."""
+
+    libraries: list[str] = Field(description="The list of libraries.")
+
+
+if __name__ == "__main__":
+    from langchain_openai import ChatOpenAI
+
+    llm = ChatOpenAI(model="gpt-4o-mini")
+    structured_llm = llm.with_structured_output(PythonLibraries)
+    structured_llm.invoke("I want to create a website scraper for abcnews.com. I want to use playwright for the scraping")
diff --git a/src/utilities/exceptions.py b/src/utilities/exceptions.py
new file mode 100644
index 00000000..b9658d7f
--- /dev/null
+++ b/src/utilities/exceptions.py
@@ -0,0 +1,7 @@
+"""Custom exception messages."""
+
+class MissingEnvironmentVariableError(Exception):
+    """Environment variable missing."""
+
+class ModuleImportedButNotLocatedError(Exception):
+    """A module was imported but the location of its source files is unknown."""
diff --git a/src/utilities/llms.py b/src/utilities/llms.py
index 3885dcc9..b9fa687c 100644
--- a/src/utilities/llms.py
+++ b/src/utilities/llms.py
@@ -1,11 +1,15 @@
-from langchain_openai.chat_models import ChatOpenAI as ChatOpenRouter
-from langchain_openai.chat_models import ChatOpenAI as ChatLocalModel
-from os import getenv
+"""Utilities for loading LLMs."""
 import os
+from collections.abc import Callable
+from os import getenv
+
 from dotenv import load_dotenv
-from langchain_openai.chat_models import ChatOpenAI
 from langchain_anthropic import ChatAnthropic
 from langchain_ollama import ChatOllama
+from langchain_openai.chat_models import ChatOpenAI
+from langchain_openai.chat_models import ChatOpenAI as ChatLocalModel
+from langchain_openai.chat_models import ChatOpenAI as ChatOpenRouter
+
 #from langchain_google_genai import ChatGoogleGenerativeAI
 
 load_dotenv()
@@ -23,7 +27,8 @@ def llm_open_router(model):
         timeout=60,
     )
 
-def llm_open_local_hosted(model):
+def llm_open_local_hosted(model: str) -> Callable:
+    """Return a locally hosted model."""
     return ChatLocalModel(
         openai_api_key="n/a",
         openai_api_base=getenv("LOCAL_MODEL_API_BASE"),
@@ -31,10 +36,20 @@
         timeout=90,
     )
 
-def init_llms(tools=None, run_name="Clean Coder", temp=0):
+def llms_with_tools_and_config(llms: list[Callable], tools: list[Callable] | None, run_name: str) -> list[Callable]:
+    """Bind the tools (when provided) and the run-name config to the loaded llms."""
+    for i, llm in enumerate(llms):
+        if tools:
+            llm = llm.bind_tools(tools)
+        llms[i] = llm.with_config({"run_name": run_name})
+    return llms
+
+
+def init_llms(tools: list[Callable] | None = None, run_name: str = "Clean Coder", temp: float = 0) -> list[Callable]:
+    """Return the available mid-sized LLM models, with tools (when provided) and config applied."""
     llms = []
     if getenv("ANTHROPIC_API_KEY"):
-        llms.append(ChatAnthropic(model='claude-3-5-sonnet-20241022', temperature=temp, timeout=60, max_tokens=2048))
+        llms.append(ChatAnthropic(model="claude-3-5-sonnet-20241022", temperature=temp, timeout=60, max_tokens=2048))
     if getenv("OPENROUTER_API_KEY"):
         llms.append(llm_open_router("anthropic/claude-3.5-sonnet"))
     if getenv("OPENAI_API_KEY"):
@@ -43,19 +58,16 @@ def init_llms(tools=None, run_name="Clean Coder", temp=0):
     # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60))
     if getenv("OLLAMA_MODEL"):
         llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL")))
-    if getenv("LOCAL_MODEL_API_BASE"):
+    if getenv("LOCAL_MODEL_API_BASE") and getenv("LOCAL_MODEL_NAME"):
         llms.append(llm_open_local_hosted(getenv("LOCAL_MODEL_NAME")))
-    for i, llm in enumerate(llms):
-        if tools:
-            llm = llm.bind_tools(tools)
-        llms[i] = llm.with_config({"run_name": run_name})
-    return llms
+    return llms_with_tools_and_config(llms=llms, tools=tools, run_name=run_name)
 
 
-def init_llms_mini(tools=None, run_name="Clean Coder", temp=0):
+def init_llms_mini(tools: list[Callable] | None = None, run_name: str = "Clean Coder", temp: float = 0) -> list[Callable]:
+    """Return the available small LLM models, with tools (when provided) and config applied."""
     llms = []
     if os.getenv("ANTHROPIC_API_KEY"):
-        llms.append(ChatAnthropic(model='claude-3-5-haiku-20241022', temperature=temp, timeout=60))
+        llms.append(ChatAnthropic(model="claude-3-5-haiku-20241022", temperature=temp, timeout=60))
     if os.getenv("OPENROUTER_API_KEY"):
         llms.append(llm_open_router("anthropic/claude-3.5-haiku"))
     if os.getenv("OPENAI_API_KEY"):
@@ -64,16 +76,13 @@ def init_llms_mini(tools=None, run_name="Clean Coder", temp=0):
     # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60))
     if os.getenv("OLLAMA_MODEL"):
         llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL")))
-    if getenv("LOCAL_MODEL_API_BASE"):
+    if getenv("LOCAL_MODEL_API_BASE") and getenv("LOCAL_MODEL_NAME"):
         llms.append(llm_open_local_hosted(getenv("LOCAL_MODEL_NAME")))
-    for i, llm in enumerate(llms):
-        if tools:
-            llm = llm.bind_tools(tools)
-        llms[i] = llm.with_config({"run_name": run_name})
-    return llms
+    return llms_with_tools_and_config(llms=llms, tools=tools, run_name=run_name)
 
 
-def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2):
+def init_llms_high_intelligence(tools: list[Callable] | None = None, run_name: str = "Clean Coder", temp: float = 0.2) -> list[Callable]:
+    """Return the available high-intelligence LLM models, with tools (when provided) and config applied."""
     llms = []
     if os.getenv("OPENAI_API_KEY"):
         llms.append(ChatOpenAI(model="o3-mini", temperature=1, timeout=60, reasoning_effort="high"))
@@ -89,10 +98,6 @@ def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2):
     # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60))
     if os.getenv("OLLAMA_MODEL"):
         llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL")))
-    if getenv("LOCAL_MODEL_API_BASE"):
+    if getenv("LOCAL_MODEL_API_BASE") and getenv("LOCAL_MODEL_NAME"):
         llms.append(llm_open_local_hosted(getenv("LOCAL_MODEL_NAME")))
-    for i, llm in enumerate(llms):
-        if tools:
-            llm = llm.bind_tools(tools)
-        llms[i] = llm.with_config({"run_name": run_name})
-    return llms
\ No newline at end of file
+    return llms_with_tools_and_config(llms=llms, tools=tools, run_name=run_name)
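
For reference, a minimal sketch of how the new doc-harvest path is exercised end to end. This is not part of the patch; it only assumes what the diff itself establishes: WORK_DIR and at least one model API key (e.g. OPENAI_API_KEY) are set in .env, and the task string is purely illustrative.

    import os

    from single_task_coder import run_clean_coder_pipeline

    # DocHarvester reads WORK_DIR from the environment, so it must point at the project.
    work_dir = os.environ["WORK_DIR"]
    task = "populate main_dummy.py with code to pull information with PubMedAPIWrapper from langchain"

    # doc_harvest=True runs DocHarvester before planning: the mini LLM proposes relevant
    # libraries, missing ones are pip-installed, their source files are described, and the
    # descriptions are uploaded to the project's Chroma collection.
    run_clean_coder_pipeline(task, work_dir, doc_harvest=True)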
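
Once the harvester has run, the indexed descriptions can be queried directly; a sketch using the same collection naming as DocHarvester.indexed_data and the same query call as src/tools/rag/retrieval.py (the query string is hypothetical):

    import os
    from pathlib import Path

    import chromadb

    work_dir = os.environ["WORK_DIR"]
    # naming convention taken from DocHarvester.indexed_data
    collection_name = f"clean_coder_{Path(work_dir).name}_lib_documentation_descriptions"

    chroma_client = chromadb.PersistentClient(path=work_dir + "/.clean_coder/chroma_base")
    collection = chroma_client.get_or_create_collection(name=collection_name)

    # mirrors the query in src/tools/rag/retrieval.py
    results = collection.query(query_texts=["PubMedAPIWrapper usage"], n_results=8)
    print(results["ids"][0])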