Commit 7e1628b

fix: improve content type handling in repo processor
1 parent a6052b0

File tree

1 file changed: +41 −19 lines


agentic_rag/repo_processor.py

Lines changed: 41 additions & 19 deletions
@@ -61,39 +61,61 @@ def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str
         # Ingest repository
         summary, tree, content = ingest(str(repo_path))

+        # Calculate token count based on content type
+        def estimate_tokens(content: Any) -> int:
+            if isinstance(content, dict):
+                # If content is a dictionary of file contents
+                return int(sum(len(str(c).split()) for c in content.values()) * 1.3)
+            elif isinstance(content, str):
+                # If content is a single string
+                return int(len(content.split()) * 1.3)
+            else:
+                # If content is in another format, return 0
+                return 0
+
         # Print formatted repository information
         if isinstance(summary, dict):
             repo_name = summary.get("name", "Unknown")
             file_count = len(tree) if tree else 0
-            token_count = sum(len(str(c).split()) for c in content.values()) * 1.3  # Rough estimate
-
-            print("\nRepository Information:")
-            print("-" * 50)
-            print(f"📦 Repository: {repo_name}")
-            print(f"📄 Files analyzed: {file_count}")
-            print(f"🔤 Estimated tokens: {int(token_count):,}")
         else:
-            print("\nRepository Information:")
-            print("-" * 50)
-            print(f"📦 Repository: {repo_path}")
-            print(f"📄 Files analyzed: {len(tree) if tree else 0}")
-            print(f"🔤 Estimated tokens: {int(sum(len(str(c).split()) for c in content.values()) * 1.3):,}")
+            repo_name = str(repo_path).split('/')[-1]
+            file_count = len(tree) if tree else 0
+
+        token_count = estimate_tokens(content)
+
+        print("\nRepository Information:")
+        print("-" * 50)
+        print(f"📦 Repository: {repo_name}")
+        print(f"📄 Files analyzed: {file_count}")
+        print(f"🔤 Estimated tokens: {token_count:,}")

         # Extract metadata
         metadata = self._extract_metadata(summary, tree)

         # Process content into chunks
         processed_chunks = []
-        for file_path, file_content in content.items():
-            # Skip if content is not a string
-            if not isinstance(file_content, str):
-                continue
-
+
+        if isinstance(content, dict):
+            # Handle dictionary of file contents
+            for file_path, file_content in content.items():
+                if isinstance(file_content, str):
+                    chunk = {
+                        "text": file_content,
+                        "metadata": {
+                            **metadata,
+                            "file_path": file_path,
+                            "source": str(repo_path),
+                            "document_id": document_id
+                        }
+                    }
+                    processed_chunks.append(chunk)
+        elif isinstance(content, str):
+            # Handle single string content
             chunk = {
-                "text": file_content,
+                "text": content,
                 "metadata": {
                     **metadata,
-                    "file_path": file_path,
+                    "file_path": "repository_content.txt",
                     "source": str(repo_path),
                     "document_id": document_id
                 }
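The change assumes the `ingest` call can return `content` either as a dict mapping file paths to text or as a single concatenated string. A minimal sketch of how the new `estimate_tokens` heuristic plays out for both shapes (the sample inputs here are hypothetical, not taken from the commit):

from typing import Any

def estimate_tokens(content: Any) -> int:
    # Rough word-count * 1.3 heuristic, mirroring the committed helper.
    if isinstance(content, dict):
        return int(sum(len(str(c).split()) for c in content.values()) * 1.3)
    elif isinstance(content, str):
        return int(len(content.split()) * 1.3)
    return 0

# Hypothetical sample inputs for the two shapes ingest() may return.
as_dict = {"README.md": "alpha beta gamma", "main.py": "delta epsilon"}
as_str = "alpha beta gamma delta epsilon"

print(estimate_tokens(as_dict))  # 5 words * 1.3 -> 6
print(estimate_tokens(as_str))   # 5 words * 1.3 -> 6
print(estimate_tokens(None))     # unsupported type -> 0

Either shape yields the same estimate, and an unexpected type degrades to 0 rather than hitting the AttributeError the previous unconditional `content.values()` call would raise on a plain string.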