Skip to content

Commit 34e0bcd

Browse files
authored
feat: add logic for repo setup + code indexing (#4626)
1 parent 1888467 commit 34e0bcd

File tree

9 files changed

+1151
-63
lines changed

9 files changed

+1151
-63
lines changed

servers/fai-lambda/fai-code-indexing/poetry.lock

Lines changed: 908 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

servers/fai-lambda/fai-code-indexing/pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ packages = [{include = "src"}]
77

88
[tool.poetry.dependencies]
99
python = ">=3.11,<4.0"
10+
claude-agent-sdk = "*"
11+
httpx = "*"
12+
pydantic = "*"
1013

1114
[tool.poetry.group.dev.dependencies]
1215
aiosqlite = "^0.21.0"

servers/fai-lambda/fai-code-indexing/src/handler.py

Lines changed: 18 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,25 @@
11
import asyncio
22
import json
33
import logging
4-
import uuid
54
from datetime import (
65
UTC,
76
datetime,
87
)
9-
from typing import Any
8+
from typing import (
9+
Any,
10+
Literal,
11+
)
1012

1113
from shared.utils.validation import validate_body_param_or_throw
1214

13-
from .utils.git import clone_repo
15+
from .operations import (
16+
run_code_search_tool_call,
17+
setup_repo_for_domain,
18+
)
1419

1520
logger = logging.getLogger()
1621
logger.setLevel(logging.INFO)
1722

18-
INDEXING_SYSTEM_PROMPT = """You are a code indexing assistant. \
19-
Your task is to analyze code repositories and extract relevant information \
20-
for documentation and search purposes."""
21-
22-
23-
async def handle_indexing_request(
24-
repository: str,
25-
) -> dict[str, Any]:
26-
"""Handle code indexing request.
27-
28-
Args:
29-
repository: GitHub repository in format 'owner/repo'
30-
31-
Returns:
32-
Dictionary with indexing results
33-
"""
34-
session_id = str(uuid.uuid4())
35-
logger.info(f"Starting indexing session {session_id} for repository: {repository}")
36-
37-
repo_path = clone_repo(repository=repository, session_id=session_id)
38-
logger.info(f"Repository cloned to: {repo_path}")
39-
40-
# user_prompt = f"""Analyze the codebase at {repo_path} and provide a summary of:
41-
# 1. Main programming languages used
42-
# 2. Project structure and key directories
43-
# 3. Entry points and main files
44-
# 4. Dependencies and package managers used
45-
# """
46-
47-
# claude_session_id = await run_indexing_session(
48-
# repo_path=repo_path,
49-
# system_prompt=INDEXING_SYSTEM_PROMPT,
50-
# user_prompt=user_prompt,
51-
# )
52-
53-
# logger.info(f"Indexing session completed: {claude_session_id}")
54-
55-
return {
56-
"session_id": session_id,
57-
# "claude_session_id": claude_session_id,
58-
"repository": repository,
59-
"status": "success",
60-
}
61-
6223

6324
def handler(event: dict[str, Any], context: Any) -> dict[str, Any]:
6425
"""Lambda handler for code indexing."""
@@ -67,15 +28,21 @@ def handler(event: dict[str, Any], context: Any) -> dict[str, Any]:
6728

6829
try:
6930
body = json.loads(event.get("body", "{}"))
70-
repository = validate_body_param_or_throw(body, "repository")
31+
domain = validate_body_param_or_throw(body, "domain")
32+
event_type: Literal["codeSearch", "indexRepo"] = validate_body_param_or_throw(body, "eventType")
7133

72-
result = asyncio.run(handle_indexing_request(repository=repository))
34+
if event_type == "indexRepo":
35+
repo_url = validate_body_param_or_throw(body, "repoUrl")
36+
asyncio.run(setup_repo_for_domain(domain=domain, repo_url=repo_url))
37+
message = "Repository indexed successfully"
38+
elif event_type == "codeSearch":
39+
asyncio.run(run_code_search_tool_call(domain=domain))
40+
message = "Code search completed successfully"
7341

7442
response_body = {
75-
"message": "Indexing completed successfully",
43+
"message": message,
7644
"timestamp": datetime.now(UTC).isoformat(),
7745
"requestId": context.aws_request_id,
78-
"result": result,
7946
}
8047

8148
return {
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from typing import Literal
2+
3+
from pydantic import BaseModel
4+
5+
6+
class AnalysisResult(BaseModel):
    """Outcome of one agent analysis run over a domain's repositories."""

    # Domain whose repositories were analyzed.
    domain: str
    # Agent session identifier, or None when no identifier is available.
    session_id: str | None
    # Either "success" or "error".
    status: Literal["success", "error"]
    # Error description for a failed run, or None.
    error: str | None
11+
12+
13+
class SetupRepoResult(BaseModel):
    """Outcome of setting up (cloning and analyzing) a repository for a domain."""

    # Domain the repository was associated with.
    domain: str
    # Agent session identifier, or None when no identifier is available.
    session_id: str | None
    # Either "success" or "error".
    status: Literal["success", "error"]
    # Error description for a failed setup, or None.
    error: str | None
18+
19+
20+
class CodeSearchResult(BaseModel):
    """Outcome of a code search for a domain.

    Only a successful status can be represented by this model.
    """

    # Domain the search was run for.
    domain: str
    # Always "success"; the model has no failure state.
    status: Literal["success"]
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from .analysis import analyze_repositories_for_domain
2+
from .indexing import setup_repo_for_domain
3+
from .search import run_code_search_tool_call
4+
5+
__all__ = ["analyze_repositories_for_domain", "setup_repo_for_domain", "run_code_search_tool_call"]
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import logging
2+
import os
3+
from pathlib import Path
4+
5+
from claude_agent_sdk import (
6+
AssistantMessage,
7+
ClaudeAgentOptions,
8+
TextBlock,
9+
ToolUseBlock,
10+
query,
11+
)
12+
13+
from ..models import AnalysisResult
14+
15+
logger = logging.getLogger()
16+
17+
CLAUDE_ANALYST_SYSTEM_PROMPT = """
18+
You are an expert code analyst.
19+
"""
20+
21+
CLAUDE_ANALYST_USER_PROMPT = """
22+
You are given a group of code repositories that are all associated with the same company/core product offering.
23+
24+
Your Task:
25+
- Understand the code of each individual repository by exploring its architecture, language, purpose, and usage
26+
- Understand the potential use-cases of each repository
27+
28+
DO NOT modify the code in any way.
29+
"""
30+
31+
32+
async def analyze_repositories_for_domain(domain: str) -> AnalysisResult:
    """Run an agent analysis session over every repository stored for a domain.

    The agent is started with its working directory set to the domain's folder
    under the EFS root (taken from ``$HOME``, defaulting to ``/mnt/efs``) and
    with file-editing tools disallowed.

    Args:
        domain: The domain whose repository folder should be analyzed.

    Returns:
        An ``AnalysisResult``. ``status`` is ``"success"`` with the session ID
        captured from the agent's ``init`` message (``None`` if none was seen),
        or ``"error"`` with ``error`` set to the stringified exception when the
        agent session raised.

    Raises:
        ValueError: If ``domain`` does not resolve to a folder strictly inside
            the EFS root, or if that folder does not exist.
    """
    efs_root = Path(os.environ.get("HOME", "/mnt/efs"))
    domain_folder = efs_root / domain

    # `domain` arrives from the request body. An empty value, "..", or an
    # absolute path would otherwise point the agent at the EFS root itself or
    # at a location outside of it.
    if efs_root.resolve() not in domain_folder.resolve().parents:
        raise ValueError(f"Domain must resolve to a folder inside {efs_root}: {domain!r}")

    logger.info(f"Analyzing repositories for domain: {domain}")

    # is_dir() rather than exists(): a regular file cannot serve as the cwd.
    if not domain_folder.is_dir():
        raise ValueError(f"Domain folder does not exist: {domain_folder}")

    session_id: str | None = None

    try:
        async for message in query(
            prompt=CLAUDE_ANALYST_USER_PROMPT,
            options=ClaudeAgentOptions(
                cwd=str(domain_folder),
                system_prompt=CLAUDE_ANALYST_SYSTEM_PROMPT,
                # "Write" alone does not stop in-place modification; the
                # editing tools must be blocked as well for the prompt's
                # "DO NOT modify" rule to be enforced rather than requested.
                # NOTE(review): shell access is not restricted here - confirm
                # whether the analysis needs it before adding it to this list.
                disallowed_tools=[
                    "Write",
                    "Edit",
                    "MultiEdit",
                    "NotebookEdit",
                    "Delete",
                    "Rename",
                ],
            ),
        ):
            # The init message carries the session ID in its `data` dict.
            if getattr(message, "subtype", None) == "init":
                data = getattr(message, "data", None)
                if isinstance(data, dict):
                    session_id = data.get("session_id")
                    logger.info(f"Session started with ID: {session_id}")

            if isinstance(message, AssistantMessage):
                for content in message.content:
                    if isinstance(content, ToolUseBlock):
                        logger.info(f"Tool used: {content.name}")
                    elif isinstance(content, TextBlock):
                        logger.info(f"{content.text}")
    except Exception as e:
        # Boundary handler: failures are reported through the result object,
        # so keep the traceback in the log where it can still be diagnosed.
        logger.exception(f"Failed to analyze repositories for domain {domain}: {e}")
        return AnalysisResult(domain=domain, session_id=None, status="error", error=str(e))

    return AnalysisResult(domain=domain, session_id=session_id, status="success", error=None)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
import logging
2+
3+
from ..models import SetupRepoResult
4+
from ..utils.git import clone_repo_to_domain
5+
from .analysis import analyze_repositories_for_domain
6+
7+
logger = logging.getLogger()
8+
9+
10+
async def setup_repo_for_domain(domain: str, repo_url: str) -> SetupRepoResult:
    """Set up a repository for a domain by cloning it and analyzing the domain.

    Args:
        domain: The domain to associate the repository with.
        repo_url: The GitHub repository URL to clone and index.

    Returns:
        A ``SetupRepoResult`` carrying the session ID, status, and error of the
        analysis step. An analysis failure is reported through ``status`` and
        ``error`` rather than raised, so callers must inspect the result.

    Raises:
        Any exception raised by ``clone_repo_to_domain`` or
        ``analyze_repositories_for_domain`` propagates unchanged.
    """
    logger.info(f"Setting up repository for domain: {domain}, repo: {repo_url}")

    repo_path = clone_repo_to_domain(domain=domain, repo_url=repo_url)
    logger.info(f"Repository cloned to: {repo_path}")

    analysis_result = await analyze_repositories_for_domain(domain=domain)

    # The analysis step returns an error result instead of raising, so only
    # report completion when it actually succeeded.
    if analysis_result.status == "success":
        logger.info(f"Analysis completed for domain {domain} repositories")
    else:
        logger.error(f"Analysis failed for domain {domain} repositories: {analysis_result.error}")

    return SetupRepoResult(
        domain=domain,
        session_id=analysis_result.session_id,
        status=analysis_result.status,
        error=analysis_result.error,
    )
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import logging
2+
from typing import Any
3+
4+
logger = logging.getLogger()
5+
6+
7+
async def run_code_search_tool_call(domain: str) -> dict[str, Any]:
    """Placeholder for running a code search against a domain.

    No search is performed yet: the call is logged and a fixed success payload
    is returned.

    Args:
        domain: The domain to search code for.

    Returns:
        A dict with the given ``domain`` and a ``status`` of ``"success"``.
    """
    logger.info(f"Running code search for domain: {domain}")

    # TODO: Implement code search logic

    payload: dict[str, Any] = {"domain": domain, "status": "success"}
    return payload
Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,72 @@
1-
from shared.utils.git import clone_repo as shared_clone_repo
1+
import logging
2+
import os
3+
import subprocess
4+
from pathlib import Path
25

6+
logger = logging.getLogger()
37

4-
def clone_repo(repository: str, session_id: str) -> str:
5-
"""Clone a GitHub repository into /tmp for indexing.
8+
9+
def clone_repo_to_domain(domain: str, repo_url: str) -> str:
10+
"""Clone a GitHub repository into EFS under a domain folder.
611
712
Args:
8-
repository: GitHub repository in format 'owner/repo'
9-
session_id: Unique identifier for this indexing session
13+
domain: The domain to associate the repository with (e.g., 'hume.docs.buildwithfern.com')
14+
repo_url: The GitHub repository URL or 'owner/repo' format
1015
1116
Returns:
1217
Path to the cloned repository
1318
"""
14-
return shared_clone_repo(repository=repository, session_id=session_id, session_type="indexing")
19+
github_token = os.environ.get("GITHUB_TOKEN")
20+
efs_root = Path(os.environ.get("HOME", "/mnt/efs"))
21+
22+
domain_folder = efs_root / domain
23+
domain_folder.mkdir(parents=True, exist_ok=True)
24+
25+
if repo_url.startswith("https://github.com/"):
26+
repo_identifier = repo_url.replace("https://github.com/", "").replace(".git", "").rstrip("/")
27+
else:
28+
repo_identifier = repo_url.replace(".git", "")
29+
30+
repo_name = repo_identifier.split("/")[-1]
31+
repo_path = domain_folder / repo_name
32+
33+
if repo_path.exists():
34+
logger.info(f"Repository already exists at {repo_path}, pulling latest changes")
35+
try:
36+
subprocess.run(
37+
["git", "config", "--global", "--add", "safe.directory", str(repo_path)],
38+
capture_output=True,
39+
text=True,
40+
)
41+
subprocess.run(
42+
["git", "-C", str(repo_path), "fetch", "origin"],
43+
check=True,
44+
capture_output=True,
45+
text=True,
46+
)
47+
subprocess.run(
48+
["git", "-C", str(repo_path), "pull", "origin"],
49+
check=True,
50+
capture_output=True,
51+
text=True,
52+
)
53+
logger.info(f"Successfully pulled latest changes at {repo_path}")
54+
except subprocess.CalledProcessError as e:
55+
logger.error(f"Failed to pull repository: {e.stderr}")
56+
raise RuntimeError(f"Failed to pull latest changes: {e.stderr}")
57+
else:
58+
clone_url = f"https://x-access-token:{github_token}@github.com/{repo_identifier}.git"
59+
60+
logger.info(f"Cloning {repo_identifier} into {repo_path} (shallow)")
61+
try:
62+
subprocess.run(
63+
["git", "clone", "--depth", "1", clone_url, str(repo_path)],
64+
check=True,
65+
capture_output=True,
66+
text=True,
67+
)
68+
except subprocess.CalledProcessError as e:
69+
logger.error(f"Failed to clone repository: {e.stderr}")
70+
raise RuntimeError(f"Failed to clone {repo_identifier}: {e.stderr}")
71+
72+
return str(repo_path)

0 commit comments

Comments
 (0)