Skip to content

Commit 3ac37aa

Browse files
feat: Add Java support and enhance Python analyzer
- Add `tree-sitter-java` dependency.
- Implement `JavaParser` in `codesage/analyzers/java_parser.py` with AST extraction (classes, methods, imports) and complexity calculation.
- Implement `JavaSemanticSnapshotBuilder` in `codesage/semantic_digest/java_snapshot_builder.py` supporting package-aware FQN and annotation tags.
- Update `codesage/analyzers/parser_factory.py` to register `JavaParser`.
- Update `codesage/config/defaults.py` with Java file extensions and ignore paths.
- Enhance `PythonParser` in `codesage/analyzers/python_parser.py`:
  - Extract class attributes (fields).
  - Add line numbers to import nodes.
  - Fix type extraction for annotated assignments.
- Update `codesage/analyzers/ast_models.py`:
  - Add `fields` to `ClassNode`.
  - Add `lineno` to `ImportNode`.
- Enhance `PythonSemanticSnapshotBuilder` to support code sampling for high-complexity functions.
- Fix metric calculation to avoid double-counting methods in Java snapshots.
1 parent 91dbd1f commit 3ac37aa

File tree

9 files changed

+609
-14
lines changed

9 files changed

+609
-14
lines changed

codesage/analyzers/ast_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class ImportNode(ASTNode):
4141
path: str
4242
alias: Optional[str] = None
4343
is_relative: bool = False
44+
lineno: int = 0
4445

4546
class FileAST(BaseModel):
4647
path: str

codesage/analyzers/java_parser.py

Lines changed: 317 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,317 @@
1+
from tree_sitter import Language, Parser, Node
2+
import tree_sitter_java as tsjava
3+
from codesage.analyzers.base import BaseParser
4+
from codesage.analyzers.ast_models import FunctionNode, ClassNode, ImportNode, VariableNode
5+
from codesage.snapshot.models import ASTSummary, ComplexityMetrics
6+
from typing import List, Set
7+
8+
# Tree-sitter node types that each add one point to the complexity score
# computed by JavaParser.calculate_complexity.
JAVA_COMPLEXITY_NODES = (
    # Branching constructs (ternary and switch included).
    {
        "if_statement",
        "switch_expression",
        "conditional_expression",
        "case_label",
    }
    # Loops.
    | {
        "for_statement",
        "enhanced_for_statement",
        "while_statement",
        "do_statement",
    }
    # Exception handling and explicit exits.
    | {
        "catch_clause",
        "throw_statement",
        "return_statement",
    }
)
21+
22+
# Maps an invoked method name to the semantic tag attached to the enclosing
# function. Grouped by tag; the resulting dict keeps the listed key order.
SEMANTIC_TAGS_RULES = {
    method_name: tag
    for tag, method_names in (
        (
            "db_op",
            (
                "execute",
                "executeQuery",
                "executeUpdate",
                "save",
                "delete",
                "findById",
                "persist",
                "merge",
            ),
        ),
        ("network", ("send", "connect", "openStream")),
        ("file_io", ("read", "write", "readAllBytes", "lines")),
        ("io_op", ("println", "print", "readLine")),
    )
    for method_name in method_names
}
45+
46+
# Maps a Java annotation's simple name (without "@" or arguments) to the
# semantic tag it contributes. Grouped by tag; key order is preserved.
ANNOTATION_TAGS = {
    annotation_name: tag
    for tag, annotation_names in (
        (
            "network",
            (
                "GetMapping",
                "PostMapping",
                "PutMapping",
                "DeleteMapping",
                "RequestMapping",
                "PatchMapping",
            ),
        ),
        ("db_op", ("Entity", "Table", "Repository")),
        ("service", ("Service",)),
        ("controller", ("Controller", "RestController")),
        ("component", ("Component",)),
        ("config", ("Configuration", "Bean")),
    )
    for annotation_name in annotation_names
}
63+
64+
class JavaParser(BaseParser):
    """Tree-sitter backed parser for Java source files.

    Extracts classes, methods/constructors, imports and the package name from
    the parsed tree and derives simple complexity and comment metrics.

    ``self.tree``, ``self.parse``, ``self._walk`` and ``self._text`` are not
    defined in this class; they are expected to come from ``BaseParser``.
    """

    # Declarations reported as functions.
    _CALLABLE_TYPES = ("method_declaration", "constructor_declaration")
    # Declarations reported as classes.
    _CLASS_LIKE_TYPES = (
        "class_declaration",
        "interface_declaration",
        "record_declaration",
        "enum_declaration",
    )
    # Nodes that carry a (possibly dotted) package / import name.
    _NAME_TYPES = ("dotted_name", "scoped_identifier", "identifier")
    # Nodes that name a type inside an `extends` / `implements` clause.
    # `scoped_type_identifier` is what the grammar emits for qualified types
    # such as `com.example.Base`; `scoped_identifier` is kept for
    # compatibility with the previous implementation.
    _TYPE_REF_TYPES = (
        "type_identifier",
        "generic_type",
        "scoped_type_identifier",
        "scoped_identifier",
    )
    # Parameter nodes; `spread_parameter` is a varargs parameter (`String... a`).
    _PARAM_TYPES = ("formal_parameter", "spread_parameter")
    # Access modifiers that make a declaration count as exported.
    _EXPORTED_KEYWORDS = ("public", "protected")

    def __init__(self):
        super().__init__()
        # No try/except here: a missing or incompatible tree-sitter-java
        # installation should surface immediately with its original traceback.
        java_language = Language(tsjava.language())
        self.parser = Parser(java_language)

    def _parse(self, source_code: bytes):
        """Parse raw source bytes and return the tree-sitter tree."""
        return self.parser.parse(source_code)

    def extract_functions(self) -> List[FunctionNode]:
        """Return every method and constructor found anywhere in the tree."""
        if not self.tree:
            return []

        return [
            self._build_function_node(node)
            for node in self._walk(self.tree.root_node)
            if node.type in self._CALLABLE_TYPES
        ]

    def extract_classes(self) -> List[ClassNode]:
        """Return a ClassNode for each class, interface, record and enum.

        ``methods`` only contains callables declared directly in the type's
        body. ``base_classes`` lists the superclass followed by the
        implemented interfaces. A type is exported when it is declared
        ``public`` or ``protected``; package-private types are not.
        """
        classes = []
        if not self.tree:
            return classes

        for node in self._walk(self.tree.root_node):
            if node.type not in self._CLASS_LIKE_TYPES:
                continue

            name_node = node.child_by_field_name("name")
            body = node.child_by_field_name("body")
            methods = [
                self._build_function_node(child)
                for child in (body.children if body else [])
                if child.type in self._CALLABLE_TYPES
            ]

            # `extends X` and `implements A, B<T>`: record each referenced
            # type once, without descending into its components.
            base_classes = []
            for field_name in ("superclass", "interfaces"):
                clause = node.child_by_field_name(field_name)
                if clause:
                    base_classes.extend(self._collect_type_names(clause))

            modifiers_node = self._find_modifiers(node)
            annotations = self._get_annotations(modifiers_node)

            classes.append(ClassNode(
                node_type="class",
                name=self._text(name_node) if name_node else '',
                methods=methods,
                base_classes=base_classes,
                is_exported=self._is_exported(modifiers_node),
                tags=self._annotation_tags(annotations),
            ))
        return classes

    def extract_package(self) -> str:
        """Return the declared package name, or "" when there is none."""
        if not self.tree:
            return ""

        for node in self._walk(self.tree.root_node):
            if node.type != "package_declaration":
                continue
            for child in node.children:
                if child.type in self._NAME_TYPES:
                    return self._text(child)
        return ""

    def extract_imports(self) -> List[ImportNode]:
        """Return one ImportNode per ``import`` declaration.

        Wildcard imports get a trailing ``.*`` on ``path``. Java has no
        import aliases or relative imports, so ``alias`` is always None and
        ``is_relative`` always False.
        """
        imports = []
        if not self.tree:
            return imports

        for node in self._walk(self.tree.root_node):
            if node.type != "import_declaration":
                continue

            path = ""
            has_wildcard = False
            for child in node.children:
                if child.type in self._NAME_TYPES:
                    path = self._text(child)
                elif child.type == "asterisk":
                    has_wildcard = True

            if not has_wildcard:
                # Fallback for grammars that do not expose an `asterisk`
                # node. The statement text ends with ';', so strip it before
                # testing for the wildcard suffix.
                statement = self._text(node).strip().rstrip(";").rstrip()
                has_wildcard = statement.endswith(".*")

            if has_wildcard and not path.endswith(".*"):
                path += ".*"

            imports.append(ImportNode(
                node_type="import",
                path=path,
                alias=None,
                is_relative=False,
                # tree-sitter rows are 0-based; report a 1-based line number.
                # NOTE(review): confirm this matches the convention used by
                # the other parsers when they populate ImportNode.lineno.
                lineno=node.start_point[0] + 1,
            ))
        return imports

    # Java has no standalone global variables the way Python does; they are
    # static fields on classes. No extract_variables is provided here.

    def _build_function_node(self, func_node):
        """Build a FunctionNode from a method or constructor declaration.

        ``start_line`` / ``end_line`` are the 0-based rows reported by
        tree-sitter. Constructors have no return type node and are reported
        with return type "void".
        """
        name_node = func_node.child_by_field_name("name")
        params_node = func_node.child_by_field_name("parameters")
        return_type_node = func_node.child_by_field_name("type")

        modifiers_node = self._find_modifiers(func_node)
        decorators = self._get_annotations(modifiers_node)

        if return_type_node:
            return_type = self._text(return_type_node)
        elif func_node.type == "constructor_declaration":
            return_type = "void"
        else:
            return_type = None

        # Tags come from calls made in the body plus the annotations.
        tags = self._extract_tags(func_node)
        tags |= self._annotation_tags(decorators)

        params = []
        if params_node:
            params = [
                self._text(param)
                for param in params_node.children
                if param.type in self._PARAM_TYPES
            ]

        return FunctionNode(
            node_type="function",
            name=self._text(name_node) if name_node else '',
            params=params,
            return_type=return_type,
            start_line=func_node.start_point[0],
            end_line=func_node.end_point[0],
            complexity=self.calculate_complexity(func_node),
            is_async=False,  # Java has no async/await syntax
            decorators=decorators,
            tags=tags,
            is_exported=self._is_exported(modifiers_node),
        )

    def _extract_tags(self, node: Node) -> Set[str]:
        """Return semantic tags for the method names invoked under *node*."""
        tags = set()
        for child in self._walk(node):
            if child.type != "method_invocation":
                continue
            name_node = child.child_by_field_name("name")
            if not name_node:
                continue
            method_name = self._text(name_node)
            if method_name in SEMANTIC_TAGS_RULES:
                tags.add(SEMANTIC_TAGS_RULES[method_name])
        return tags

    def _find_modifiers(self, node):
        """Return the ``modifiers`` child of a declaration, or None.

        The field lookup is tried first; the fallback scans the children by
        node type because the Java grammar attaches ``modifiers`` as a plain
        named child rather than as a field.
        """
        modifiers = node.child_by_field_name("modifiers")
        if modifiers is not None:
            return modifiers
        for child in node.children:
            if child.type == "modifiers":
                return child
        return None

    def _is_exported(self, modifiers_node) -> bool:
        """Return True when the modifiers contain ``public`` or ``protected``."""
        if not modifiers_node:
            return False
        return any(
            child.type in self._EXPORTED_KEYWORDS
            for child in modifiers_node.children
        )

    def _collect_type_names(self, node) -> List[str]:
        """Return the text of each type referenced under *node*.

        Recursion stops at the first type node on each path, so
        ``Comparable<Foo>`` is recorded once instead of also yielding
        ``Comparable`` and ``Foo``.
        """
        names = []
        for child in node.children:
            if child.type in self._TYPE_REF_TYPES:
                names.append(self._text(child))
            else:
                names.extend(self._collect_type_names(child))
        return names

    @staticmethod
    def _annotation_tags(annotations) -> Set[str]:
        """Map annotation source texts to the tags in ANNOTATION_TAGS."""
        tags = set()
        for ann in annotations:
            # '@org.x.GetMapping("/a")' -> 'GetMapping'
            simple_name = (
                ann.replace("@", "").split("(")[0].strip().rsplit(".", 1)[-1]
            )
            if simple_name in ANNOTATION_TAGS:
                tags.add(ANNOTATION_TAGS[simple_name])
        return tags

    def _get_annotations(self, modifiers_node):
        """Return the source text of each annotation in *modifiers_node*."""
        if not modifiers_node:
            return []

        return [
            self._text(child)
            for child in modifiers_node.children
            if child.type in ("marker_annotation", "annotation")
        ]

    def calculate_complexity(self, node: Node) -> int:
        """Return a cyclomatic-style complexity score for *node*.

        Starts at 1 and adds one for every descendant listed in
        JAVA_COMPLEXITY_NODES, every ``case`` label of a switch, and every
        ``&&`` / ``||`` binary expression.
        """
        complexity = 1

        for child in self._walk(node):
            kind = child.type
            if kind == "switch_label":
                # The grammar emits `switch_label` for both `case ...` and
                # `default`; only `case` adds a decision point.
                if not self._text(child).lstrip().startswith("default"):
                    complexity += 1
            elif kind in JAVA_COMPLEXITY_NODES:
                complexity += 1
            elif kind == "binary_expression":
                operator = child.child_by_field_name("operator")
                if operator and self._text(operator) in ("&&", "||"):
                    complexity += 1

        return complexity

    def get_ast_summary(self, source_code: str) -> ASTSummary:
        """Parse *source_code* and return function/class/import/comment counts."""
        self.parse(source_code)
        return ASTSummary(
            function_count=len(self.extract_functions()),
            class_count=len(self.extract_classes()),
            import_count=len(self.extract_imports()),
            comment_lines=self._count_comment_lines(),
        )

    def _count_comment_lines(self) -> int:
        """Return the number of distinct source lines covered by comments."""
        if not self.tree:
            return 0

        comment_lines = set()
        for node in self._walk(self.tree.root_node):
            if node.type in ('line_comment', 'block_comment'):
                # Rows are inclusive on both ends.
                comment_lines.update(
                    range(node.start_point[0], node.end_point[0] + 1)
                )
        return len(comment_lines)

    def get_complexity_metrics(self, source_code: str) -> ComplexityMetrics:
        """Parse *source_code* and return whole-file complexity metrics."""
        self.parse(source_code)
        if not self.tree:
            return ComplexityMetrics(cyclomatic=0)

        return ComplexityMetrics(
            cyclomatic=self.calculate_complexity(self.tree.root_node)
        )

codesage/analyzers/parser_factory.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from codesage.analyzers.base import BaseParser
22
from codesage.analyzers.go_parser import GoParser
33
from codesage.analyzers.python_parser import PythonParser
4+
from codesage.analyzers.java_parser import JavaParser
45

56
# Registry mapping a language identifier to its parser class.
PARSERS = dict(
    go=GoParser,
    python=PythonParser,
    java=JavaParser,
)
911

1012
def create_parser(language: str) -> BaseParser:

0 commit comments

Comments
 (0)