@@ -61,39 +61,61 @@ def process_repo(self, repo_path: str | Path) -> Tuple[List[Dict[str, Any]], str
61
61
# Ingest repository
62
62
summary , tree , content = ingest (str (repo_path ))
63
63
64
+ # Calculate token count based on content type
65
def estimate_tokens(content: Any) -> int:
    """Return a rough token estimate for ingested content.

    The estimate is the number of whitespace-separated words scaled by 1.3
    and truncated to an int.

    Args:
        content: Either a dict (each value is stringified and its words are
            counted), a single string, or any other object.

    Returns:
        The estimated token count; 0 when ``content`` is neither a dict
        nor a str.
    """
    if isinstance(content, dict):
        # Values are passed through str() so non-string entries still count.
        word_total = 0
        for file_text in content.values():
            word_total += len(str(file_text).split())
    elif isinstance(content, str):
        word_total = len(content.split())
    else:
        # Unrecognized content format: no estimate is attempted.
        return 0
    return int(word_total * 1.3)
75
+
64
76
# Print formatted repository information
65
77
if isinstance (summary , dict ):
66
78
repo_name = summary .get ("name" , "Unknown" )
67
79
file_count = len (tree ) if tree else 0
68
- token_count = sum (len (str (c ).split ()) for c in content .values ()) * 1.3 # Rough estimate
69
-
70
- print ("\n Repository Information:" )
71
- print ("-" * 50 )
72
- print (f"📦 Repository: { repo_name } " )
73
- print (f"📄 Files analyzed: { file_count } " )
74
- print (f"🔤 Estimated tokens: { int (token_count ):,} " )
75
80
else :
76
- print ("\n Repository Information:" )
77
- print ("-" * 50 )
78
- print (f"📦 Repository: { repo_path } " )
79
- print (f"📄 Files analyzed: { len (tree ) if tree else 0 } " )
80
- print (f"🔤 Estimated tokens: { int (sum (len (str (c ).split ()) for c in content .values ()) * 1.3 ):,} " )
81
+ repo_name = str (repo_path ).split ('/' )[- 1 ]
82
+ file_count = len (tree ) if tree else 0
83
+
84
+ token_count = estimate_tokens (content )
85
+
86
+ print ("\n Repository Information:" )
87
+ print ("-" * 50 )
88
+ print (f"📦 Repository: { repo_name } " )
89
+ print (f"📄 Files analyzed: { file_count } " )
90
+ print (f"🔤 Estimated tokens: { token_count :,} " )
81
91
82
92
# Extract metadata
83
93
metadata = self ._extract_metadata (summary , tree )
84
94
85
95
# Process content into chunks
86
96
processed_chunks = []
87
- for file_path , file_content in content .items ():
88
- # Skip if content is not a string
89
- if not isinstance (file_content , str ):
90
- continue
91
-
97
+
98
+ if isinstance (content , dict ):
99
+ # Handle dictionary of file contents
100
+ for file_path , file_content in content .items ():
101
+ if isinstance (file_content , str ):
102
+ chunk = {
103
+ "text" : file_content ,
104
+ "metadata" : {
105
+ ** metadata ,
106
+ "file_path" : file_path ,
107
+ "source" : str (repo_path ),
108
+ "document_id" : document_id
109
+ }
110
+ }
111
+ processed_chunks .append (chunk )
112
+ elif isinstance (content , str ):
113
+ # Handle single string content
92
114
chunk = {
93
- "text" : file_content ,
115
+ "text" : content ,
94
116
"metadata" : {
95
117
** metadata ,
96
- "file_path" : file_path ,
118
+ "file_path" : "repository_content.txt" ,
97
119
"source" : str (repo_path ),
98
120
"document_id" : document_id
99
121
}
0 commit comments