feat: add logic for repo setup + code indexing (#4626)

sahil485 · web-flow · commit 34e0bcd9ca2b · 2025-10-31T02:02:47.000-04:00
diff --git a/servers/fai-lambda/fai-code-indexing/poetry.lock b/servers/fai-lambda/fai-code-indexing/poetry.lock
diff --git a/servers/fai-lambda/fai-code-indexing/pyproject.toml b/servers/fai-lambda/fai-code-indexing/pyproject.toml
@@ -7,6 +7,9 @@ packages = [{include = "src"}]
 
 [tool.poetry.dependencies]
 python = ">=3.11,<4.0"
+claude-agent-sdk = "*"
+httpx = "*"
+pydantic = "*"
 
 [tool.poetry.group.dev.dependencies]
 aiosqlite = "^0.21.0"
diff --git a/servers/fai-lambda/fai-code-indexing/src/handler.py b/servers/fai-lambda/fai-code-indexing/src/handler.py
@@ -1,64 +1,25 @@
 import asyncio
 import json
 import logging
-import uuid
 from datetime import (
     UTC,
     datetime,
 )
-from typing import Any
+from typing import (
+    Any,
+    Literal,
+)
 
 from shared.utils.validation import validate_body_param_or_throw
 
-from .utils.git import clone_repo
+from .operations import (
+    run_code_search_tool_call,
+    setup_repo_for_domain,
+)
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
-INDEXING_SYSTEM_PROMPT = """You are a code indexing assistant. \
-Your task is to analyze code repositories and extract relevant information \
-for documentation and search purposes."""
-
-
-async def handle_indexing_request(
-    repository: str,
-) -> dict[str, Any]:
-    """Handle code indexing request.
-
-    Args:
-        repository: GitHub repository in format 'owner/repo'
-
-    Returns:
-        Dictionary with indexing results
-    """
-    session_id = str(uuid.uuid4())
-    logger.info(f"Starting indexing session {session_id} for repository: {repository}")
-
-    repo_path = clone_repo(repository=repository, session_id=session_id)
-    logger.info(f"Repository cloned to: {repo_path}")
-
-    #     user_prompt = f"""Analyze the codebase at {repo_path} and provide a summary of:
-    # 1. Main programming languages used
-    # 2. Project structure and key directories
-    # 3. Entry points and main files
-    # 4. Dependencies and package managers used
-    # """
-
-    #     claude_session_id = await run_indexing_session(
-    #         repo_path=repo_path,
-    #         system_prompt=INDEXING_SYSTEM_PROMPT,
-    #         user_prompt=user_prompt,
-    #     )
-
-    #     logger.info(f"Indexing session completed: {claude_session_id}")
-
-    return {
-        "session_id": session_id,
-        # "claude_session_id": claude_session_id,
-        "repository": repository,
-        "status": "success",
-    }
-
 
 def handler(event: dict[str, Any], context: Any) -> dict[str, Any]:
     """Lambda handler for code indexing."""
@@ -67,15 +28,21 @@ def handler(event: dict[str, Any], context: Any) -> dict[str, Any]:
 
     try:
         body = json.loads(event.get("body", "{}"))
-        repository = validate_body_param_or_throw(body, "repository")
+        domain = validate_body_param_or_throw(body, "domain")
+        event_type: Literal["codeSearch", "indexRepo"] = validate_body_param_or_throw(body, "eventType")
 
-        result = asyncio.run(handle_indexing_request(repository=repository))
+        if event_type == "indexRepo":
+            repo_url = validate_body_param_or_throw(body, "repoUrl")
+            asyncio.run(setup_repo_for_domain(domain=domain, repo_url=repo_url))
+            message = "Repository indexed successfully"
+        elif event_type == "codeSearch":
+            asyncio.run(run_code_search_tool_call(domain=domain))
+            message = "Code search completed successfully"
 
         response_body = {
-            "message": "Indexing completed successfully",
+            "message": message,
             "timestamp": datetime.now(UTC).isoformat(),
             "requestId": context.aws_request_id,
-            "result": result,
         }
 
         return {
diff --git a/servers/fai-lambda/fai-code-indexing/src/models.py b/servers/fai-lambda/fai-code-indexing/src/models.py
@@ -0,0 +1,22 @@
+from typing import Literal
+
+from pydantic import BaseModel
+
+
+class AnalysisResult(BaseModel):
+    """Outcome of one agent analysis run, as returned by analyze_repositories_for_domain."""
+
+    # Domain whose repositories were analyzed.
+    domain: str
+    # Session ID read from the agent's "init" message; None if none was seen or the run failed.
+    session_id: str | None
+    # "success" when the agent session finished without raising, otherwise "error".
+    status: Literal["success", "error"]
+    # Stringified exception when status is "error"; None on success.
+    error: str | None
+
+
+class SetupRepoResult(BaseModel):
+    """Outcome of cloning and analyzing a repository, as returned by setup_repo_for_domain.
+
+    Has the same fields as AnalysisResult; setup_repo_for_domain copies session_id,
+    status and error straight from the analysis result.
+    """
+
+    # Domain the repository was set up for.
+    domain: str
+    # Session ID of the analysis run, or None when unavailable.
+    session_id: str | None
+    # Status of the analysis step ("success" or "error").
+    status: Literal["success", "error"]
+    # Error text of the analysis step, or None on success.
+    error: str | None
+
+
+class CodeSearchResult(BaseModel):
+    """Result shape for a code search over a domain.
+
+    NOTE(review): not referenced by the code visible in this change;
+    run_code_search_tool_call currently returns a plain dict with the same keys.
+    """
+
+    # Domain the search was run for.
+    domain: str
+    # Only "success" is representable; there is no error variant yet.
+    status: Literal["success"]
diff --git a/servers/fai-lambda/fai-code-indexing/src/operations/__init__.py b/servers/fai-lambda/fai-code-indexing/src/operations/__init__.py
@@ -0,0 +1,5 @@
+from .analysis import analyze_repositories_for_domain
+from .indexing import setup_repo_for_domain
+from .search import run_code_search_tool_call
+
+__all__ = ["analyze_repositories_for_domain", "setup_repo_for_domain", "run_code_search_tool_call"]
diff --git a/servers/fai-lambda/fai-code-indexing/src/operations/analysis.py b/servers/fai-lambda/fai-code-indexing/src/operations/analysis.py
@@ -0,0 +1,75 @@
+import logging
+import os
+from pathlib import Path
+
+from claude_agent_sdk import (
+    AssistantMessage,
+    ClaudeAgentOptions,
+    TextBlock,
+    ToolUseBlock,
+    query,
+)
+
+from ..models import AnalysisResult
+
+logger = logging.getLogger()
+
+# System prompt for the analysis agent; passed as ClaudeAgentOptions.system_prompt below.
+CLAUDE_ANALYST_SYSTEM_PROMPT = """
+You are an expert code analyst.
+"""
+
+# User prompt sent through `query`; asks the agent to explore every repository in its
+# working directory and instructs it not to modify any code.
+CLAUDE_ANALYST_USER_PROMPT = """
+You are given a group of code repositories that are all associated with the same company/core product offering.
+
+Your Task:
+- Understand the code of each individual repository by exploring its architecture, language, purpose, and usage
+- Understand the potential use-cases of each repository
+
+DO NOT modify the code in any way.
+"""
+
+
+async def analyze_repositories_for_domain(domain: str) -> AnalysisResult:
+    """Run a read-only agent session over all repositories cloned for a domain.
+
+    The agent is started with the domain folder (``$HOME/<domain>``, falling back
+    to ``/mnt/efs/<domain>``) as its working directory.
+
+    Args:
+        domain: The domain to analyze repositories for
+
+    Returns:
+        AnalysisResult with status "success" and the session ID reported by the
+        agent's init message (None if no such message was seen), or status
+        "error" with the stringified exception if the session raised.
+
+    Raises:
+        ValueError: If the domain folder does not exist
+    """
+    logger.info(f"Analyzing repositories for domain: {domain}")
+
+    efs_root = Path(os.environ.get("HOME", "/mnt/efs"))
+    domain_folder = efs_root / domain
+
+    if not domain_folder.exists():
+        raise ValueError(f"Domain folder does not exist: {domain_folder}")
+
+    session_id: str | None = None
+
+    try:
+        async for message in query(
+            prompt=CLAUDE_ANALYST_USER_PROMPT,
+            options=ClaudeAgentOptions(
+                cwd=str(domain_folder),
+                system_prompt=CLAUDE_ANALYST_SYSTEM_PROMPT,
+                # Denying "Write" alone still leaves in-place edits possible, so the
+                # editing tools are denied as well to back up the prompt's
+                # "DO NOT modify" instruction. The original entries are kept.
+                # NOTE(review): "Bash" remains available and can also change files.
+                disallowed_tools=[
+                    "Write",
+                    "Edit",
+                    "MultiEdit",
+                    "NotebookEdit",
+                    "Delete",
+                    "Rename",
+                ],
+            ),
+        ):
+            # The init system message carries the session ID in its data payload.
+            if getattr(message, "subtype", None) == "init":
+                data = getattr(message, "data", None)
+                if isinstance(data, dict):
+                    session_id = data.get("session_id")
+                    logger.info(f"Session started with ID: {session_id}")
+
+            if isinstance(message, AssistantMessage):
+                for content in message.content:
+                    if isinstance(content, ToolUseBlock):
+                        logger.info(f"Tool used: {content.name}")
+                    elif isinstance(content, TextBlock):
+                        logger.info(f"{content.text}")
+    except Exception as e:
+        # Boundary catch: agent failures are reported through the result, not raised.
+        # logger.exception keeps the traceback that logger.error dropped.
+        logger.exception(f"Failed to analyze repositories for domain {domain}: {e}")
+        return AnalysisResult(domain=domain, session_id=None, status="error", error=str(e))
+
+    return AnalysisResult(domain=domain, session_id=session_id, status="success", error=None)
diff --git a/servers/fai-lambda/fai-code-indexing/src/operations/indexing.py b/servers/fai-lambda/fai-code-indexing/src/operations/indexing.py
@@ -0,0 +1,33 @@
+import logging
+
+from ..models import SetupRepoResult
+from ..utils.git import clone_repo_to_domain
+from .analysis import analyze_repositories_for_domain
+
+logger = logging.getLogger()
+
+
+async def setup_repo_for_domain(domain: str, repo_url: str) -> SetupRepoResult:
+    """Set up a repository for a domain by cloning (or updating) it and analyzing it.
+
+    Args:
+        domain: The domain to associate the repository with
+        repo_url: The GitHub repository URL to clone and index
+
+    Returns:
+        SetupRepoResult carrying the session ID, status and error of the analysis step
+
+    Raises:
+        RuntimeError: If cloning or pulling the repository fails
+    """
+    logger.info(f"Setting up repository for domain: {domain}, repo: {repo_url}")
+
+    repo_path = clone_repo_to_domain(domain=domain, repo_url=repo_url)
+    logger.info(f"Repository cloned to: {repo_path}")
+
+    analysis_result = await analyze_repositories_for_domain(domain=domain)
+    # The analysis reports failures through its status rather than raising, so
+    # only claim completion when it actually succeeded.
+    if analysis_result.status == "error":
+        logger.error(f"Analysis failed for domain {domain} repositories: {analysis_result.error}")
+    else:
+        logger.info(f"Analysis completed for domain {domain} repositories")
+
+    return SetupRepoResult(
+        domain=domain,
+        session_id=analysis_result.session_id,
+        status=analysis_result.status,
+        error=analysis_result.error,
+    )
diff --git a/servers/fai-lambda/fai-code-indexing/src/operations/search.py b/servers/fai-lambda/fai-code-indexing/src/operations/search.py
@@ -0,0 +1,23 @@
+import logging
+from typing import Any
+
+logger = logging.getLogger()
+
+
+async def run_code_search_tool_call(domain: str) -> dict[str, Any]:
+    """Placeholder entry point for a code search over a domain's repositories.
+
+    No search is performed yet; the call is logged and a success marker is returned.
+
+    Args:
+        domain: The domain to search code for
+
+    Returns:
+        Dictionary holding the requested ``domain`` and a ``status`` of ``"success"``
+    """
+    logger.info(f"Running code search for domain: {domain}")
+
+    # TODO: Implement code search logic
+
+    outcome: dict[str, Any] = dict(domain=domain, status="success")
+    return outcome
diff --git a/servers/fai-lambda/fai-code-indexing/src/utils/git.py b/servers/fai-lambda/fai-code-indexing/src/utils/git.py
@@ -1,14 +1,72 @@
-from shared.utils.git import clone_repo as shared_clone_repo
+import logging
+import os
+import subprocess
+from pathlib import Path
 
+logger = logging.getLogger()
 
-def clone_repo(repository: str, session_id: str) -> str:
-    """Clone a GitHub repository into /tmp for indexing.
+
+def clone_repo_to_domain(domain: str, repo_url: str) -> str:
+    """Clone a GitHub repository into EFS under a domain folder.
+
+    If the repository was already cloned, its origin is fetched and pulled instead.
 
     Args:
-        repository: GitHub repository in format 'owner/repo'
-        session_id: Unique identifier for this indexing session
+        domain: The domain to associate the repository with (e.g., 'hume.docs.buildwithfern.com')
+        repo_url: The GitHub repository URL or 'owner/repo' format
 
     Returns:
         Path to the cloned repository
+
+    Raises:
+        ValueError: If domain or repo_url would place the clone outside the domain folder
+        RuntimeError: If a git command fails (the token is masked in the message)
     """
-    return shared_clone_repo(repository=repository, session_id=session_id, session_type="indexing")
+    github_token = os.environ.get("GITHUB_TOKEN")
+    efs_root = Path(os.environ.get("HOME", "/mnt/efs"))
+
+    # domain comes from the request body: refuse values ('..', absolute paths)
+    # that would resolve to the EFS root itself or to a location outside it.
+    domain_folder = efs_root / domain
+    resolved_root = efs_root.resolve()
+    resolved_domain = domain_folder.resolve()
+    if resolved_domain == resolved_root or not resolved_domain.is_relative_to(resolved_root):
+        raise ValueError(f"Invalid domain: {domain!r}")
+
+    repo_identifier = _normalize_repo_identifier(repo_url)
+    repo_name = repo_identifier.split("/")[-1]
+    if repo_name in ("", ".", ".."):
+        raise ValueError(f"Invalid repository: {repo_url!r}")
+
+    domain_folder.mkdir(parents=True, exist_ok=True)
+    repo_path = domain_folder / repo_name
+
+    if github_token:
+        clone_url = f"https://x-access-token:{github_token}@github.com/{repo_identifier}.git"
+    else:
+        # Without a token, fall back to anonymous access instead of embedding "None".
+        logger.warning("GITHUB_TOKEN is not set; using unauthenticated GitHub access")
+        clone_url = f"https://github.com/{repo_identifier}.git"
+
+    if repo_path.exists():
+        logger.info(f"Repository already exists at {repo_path}, pulling latest changes")
+        try:
+            # Best-effort: the result is deliberately not checked.
+            subprocess.run(
+                ["git", "config", "--global", "--add", "safe.directory", str(repo_path)],
+                capture_output=True,
+                text=True,
+            )
+            if github_token:
+                # The origin URL stored at clone time embeds the token that was
+                # current back then; refresh it so an expired token does not
+                # break the fetch.
+                subprocess.run(
+                    ["git", "-C", str(repo_path), "remote", "set-url", "origin", clone_url],
+                    check=True,
+                    capture_output=True,
+                    text=True,
+                )
+            subprocess.run(
+                ["git", "-C", str(repo_path), "fetch", "origin"],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+            subprocess.run(
+                ["git", "-C", str(repo_path), "pull", "origin"],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+            logger.info(f"Successfully pulled latest changes at {repo_path}")
+        except subprocess.CalledProcessError as e:
+            stderr = _mask_secret(e.stderr, github_token)
+            logger.error(f"Failed to pull repository: {stderr}")
+            # "from None": the CalledProcessError text includes the full command
+            # line, which may contain the token.
+            raise RuntimeError(f"Failed to pull latest changes: {stderr}") from None
+    else:
+        logger.info(f"Cloning {repo_identifier} into {repo_path} (shallow)")
+        try:
+            subprocess.run(
+                ["git", "clone", "--depth", "1", clone_url, str(repo_path)],
+                check=True,
+                capture_output=True,
+                text=True,
+            )
+        except subprocess.CalledProcessError as e:
+            stderr = _mask_secret(e.stderr, github_token)
+            logger.error(f"Failed to clone repository: {stderr}")
+            # "from None": keep the token-bearing clone command out of the traceback.
+            raise RuntimeError(f"Failed to clone {repo_identifier}: {stderr}") from None
+
+    return str(repo_path)
+
+
+def _normalize_repo_identifier(repo_url: str) -> str:
+    """Reduce a GitHub URL or 'owner/repo[.git]' string to 'owner/repo'."""
+    identifier = repo_url.removeprefix("https://github.com/").rstrip("/")
+    # Strip only a trailing ".git"; a blanket replace would corrupt names such
+    # as "org/org.github.io".
+    return identifier.removesuffix(".git")
+
+
+def _mask_secret(text: str | None, secret: str | None) -> str:
+    """Return text (empty string for None) with every occurrence of secret masked."""
+    text = text or ""
+    return text.replace(secret, "***") if secret else text