Skip to content

Commit f1de3d4

Browse files
committed
Merge branch 'main' into fix-testcases
2 parents ed87d83 + 843d04f commit f1de3d4

File tree

8 files changed

+176
-21
lines changed

8 files changed

+176
-21
lines changed

.github/workflows/python-publish.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ jobs:
4141
CODE_ANALYZER_URL=$(curl -s https://api.github.com/repos/IBM/codenet-minerva-code-analyzer/releases/latest | jq -r '.assets[] | .browser_download_url')
4242
echo "Downloading: " $CODE_ANALYZER_URL
4343
wget -q $CODE_ANALYZER_URL
44-
echo "Moving codeanalyzer.jar to:" ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/codeanalyzer.jar
45-
mv codeanalyzer.jar ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/codeanalyzer.jar
44+
echo "Moving codeanalyzer jar to:" ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/
45+
mv codeanalyzer-*.jar ${{ github.workspace }}/cldk/analysis/java/codeanalyzer/jar/
4646
4747
- name: Build package
4848
run: poetry build

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://opensource.org/licenses/Apache-2.0)
77
[![Documentation](https://img.shields.io/badge/GitHub%20Pages-Docs-blue)](https://ibm.github.io/codellm-devkit/)
88
[![PyPI version](https://badge.fury.io/py/cldk.svg)](https://badge.fury.io/py/cldk)
9+
[![arXiv](https://img.shields.io/badge/arXiv-2410.13007-b31b1b.svg)](https://arxiv.org/abs/2410.13007)
10+
911

1012
Codellm-devkit (CLDK) is a multilingual program analysis framework that bridges the gap between traditional static analysis tools and Large Language Models (LLMs) specialized for code (CodeLLMs). Codellm-devkit allows developers to streamline the process of transforming raw code into actionable insights by providing a unified interface for integrating outputs from various analysis tools and preparing them for effective use by CodeLLMs.
1113

@@ -272,5 +274,6 @@ if __name__ == "__main__":
272274
```
273275
274276
### Publication (papers and blogs related to CLDK)
275-
1. Pan, Rangeet, Myeongsoo Kim, Rahul Krishna, Raju Pavuluri, and Saurabh Sinha. "[Multi-language Unit Test Generation using LLMs.](https://arxiv.org/abs/2409.03093)" arXiv preprint arXiv:2409.03093 (2024).
276-
2. Pan, Rangeet, Rahul Krishna, Raju Pavuluri, Saurabh Sinha, and Maja Vukovic., "[Simplify your Code LLM solutions using CodeLLM Dev Kit (CLDK).](https://www.linkedin.com/pulse/simplify-your-code-llm-solutions-using-codellm-dev-kit-rangeet-pan-vnnpe/?trackingId=kZ3U6d8GSDCs8S1oApXZgg%3D%3D)", Blog.
277+
1. Krishna, Rahul, Rangeet Pan, Raju Pavuluri, Srikanth Tamilselvam, Maja Vukovic, and Saurabh Sinha. "[Codellm-Devkit: A Framework for Contextualizing Code LLMs with Program Analysis Insights.](https://arxiv.org/pdf/2410.13007)" arXiv preprint arXiv:2410.13007 (2024).
278+
2. Pan, Rangeet, Myeongsoo Kim, Rahul Krishna, Raju Pavuluri, and Saurabh Sinha. "[Multi-language Unit Test Generation using LLMs.](https://arxiv.org/abs/2409.03093)" arXiv preprint arXiv:2409.03093 (2024).
279+
3. Pan, Rangeet, Rahul Krishna, Raju Pavuluri, Saurabh Sinha, and Maja Vukovic. "[Simplify your Code LLM solutions using CodeLLM Dev Kit (CLDK).](https://www.linkedin.com/pulse/simplify-your-code-llm-solutions-using-codellm-dev-kit-rangeet-pan-vnnpe/?trackingId=kZ3U6d8GSDCs8S1oApXZgg%3D%3D)" Blog.

cldk/analysis/java/java.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from networkx import DiGraph
2525

2626
from cldk.analysis import SymbolTable, CallGraph, AnalysisLevel
27+
from cldk.analysis.java.treesitter import JavaSitter
2728
from cldk.models.java import JCallable
2829
from cldk.models.java import JApplication
2930
from cldk.models.java.models import JCompilationUnit, JMethodDetail, JType, JField
@@ -175,6 +176,28 @@ def get_class_hierarchy(self) -> DiGraph:
175176
raise NotImplementedError(f"Support for this functionality has not been implemented yet.")
176177
raise NotImplementedError("Class hierarchy is not implemented yet.")
177178

179+
def is_parsable(self, source_code: str) -> bool:
    """
    Check if the code is parsable

    Args:
        source_code: source code

    Returns:
        True if the code is parsable, False otherwise
    """
    # JavaSitter.is_parsable reads ``self.parser``. Calling it unbound with
    # this analysis object as ``self`` only works if this class happens to
    # expose a compatible ``parser`` attribute, so delegate to a real
    # JavaSitter instance instead.
    # NOTE(review): assumes JavaSitter() takes no constructor arguments - confirm.
    return JavaSitter().is_parsable(source_code)
189+
190+
def get_raw_ast(self, source_code: str) -> str:
    """
    Get the raw AST

    Args:
        source_code: source code

    Returns:
        Tree: the raw AST
    """
    # NOTE(review): the value returned is whatever JavaSitter.get_raw_ast
    # returns (annotated there as a tree-sitter ``Tree``), not a ``str``.
    # The annotation is left untouched because ``Tree`` is not imported in
    # this module; import it and change the annotation together.
    #
    # JavaSitter.get_raw_ast reads ``self.parser``, so it must run against a
    # JavaSitter instance rather than being called unbound with this object.
    # NOTE(review): assumes JavaSitter() takes no constructor arguments - confirm.
    return JavaSitter().get_raw_ast(source_code)
200+
178201
def get_call_graph(self) -> DiGraph:
179202
"""Returns the call graph of the Java code.
180203

cldk/analysis/java/treesitter/javasitter.py

Lines changed: 64 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
from itertools import groupby
2222
from typing import List, Set, Dict
23-
23+
from tree_sitter import Language, Node, Parser, Query, Tree
2424
import tree_sitter_java as tsjava
2525
from tree_sitter import Language, Node, Parser, Query
2626

@@ -51,10 +51,49 @@ def method_is_not_in_class(self, method_name: str, class_body: str) -> bool:
5151
bool
5252
True if the method is in the class, False otherwise.
5353
"""
54-
methods_in_class = self.frame_query_and_capture_output("(method_declaration name: (identifier) @name)", class_body)
54+
methods_in_class = self.frame_query_and_capture_output("(method_declaration name: (identifier) @name)",
55+
class_body)
5556

5657
return method_name not in {method.node.text.decode() for method in methods_in_class}
5758

59+
def is_parsable(self, code: str) -> bool:
    """
    Check if the code is parsable

    The code is parsed with ``self.parser`` and the resulting tree is searched
    for syntax-error markers.

    Args:
        code: source code

    Returns:
        True if the code is parsable, False otherwise
    """
    tree = self.parser.parse(bytes(code, "utf-8"))
    if tree is None:
        return False

    # Walk the tree with an explicit stack rather than recursion: a deeply
    # nested but valid tree would otherwise exhaust the interpreter's
    # recursion limit and be misreported as a syntax error.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        # ERROR nodes wrap text the parser could not place; nodes flagged
        # ``is_missing`` are tokens the parser inserted to recover from an
        # error. Either one means the input was not syntactically valid.
        if node.type == "ERROR" or node.is_missing:
            return False
        pending.extend(node.children)
    return True
85+
86+
def get_raw_ast(self, code: str) -> Tree:
    """
    Parse the given source text and hand back the parser's result unchanged.

    Args:
        code: source code

    Returns:
        Tree: the raw AST produced by ``self.parser``
    """
    # The parser is fed the UTF-8 encoded bytes of the source text.
    encoded_source = bytes(code, encoding="utf-8")
    raw_tree = self.parser.parse(encoded_source)
    return raw_tree
96+
5897
def get_all_imports(self, source_code: str) -> Set[str]:
5998
"""Get a list of all the imports in a class.
6099
@@ -64,7 +103,8 @@ def get_all_imports(self, source_code: str) -> Set[str]:
64103
Returns:
65104
Set[str]: A set of all the imports in the class.
66105
"""
67-
import_declerations: Captures = self.frame_query_and_capture_output(query="(import_declaration (scoped_identifier) @name)", code_to_process=source_code)
106+
import_declerations: Captures = self.frame_query_and_capture_output(
107+
query="(import_declaration (scoped_identifier) @name)", code_to_process=source_code)
68108
return {capture.node.text.decode() for capture in import_declerations}
69109

70110
def get_pacakge_name(self, source_code: str) -> str:
@@ -76,7 +116,8 @@ def get_pacakge_name(self, source_code: str) -> str:
76116
Returns:
77117
str: The package name.
78118
"""
79-
package_name: Captures = self.frame_query_and_capture_output(query="((package_declaration) @name)", code_to_process=source_code)
119+
package_name: Captures = self.frame_query_and_capture_output(query="((package_declaration) @name)",
120+
code_to_process=source_code)
80121
if package_name:
81122
return package_name[0].node.text.decode().replace("package ", "").replace(";", "")
82123
return None
@@ -102,7 +143,8 @@ def get_superclass(self, source_code: str) -> str:
102143
Returns:
103144
Set[str]: A set of all the superclasses in the class.
104145
"""
105-
superclass: Captures = self.frame_query_and_capture_output(query="(class_declaration (superclass (type_identifier) @superclass))", code_to_process=source_code)
146+
superclass: Captures = self.frame_query_and_capture_output(
147+
query="(class_declaration (superclass (type_identifier) @superclass))", code_to_process=source_code)
106148

107149
if len(superclass) == 0:
108150
return ""
@@ -119,7 +161,9 @@ def get_all_interfaces(self, source_code: str) -> Set[str]:
119161
Set[str]: A set of all the interfaces implemented by the class.
120162
"""
121163

122-
interfaces = self.frame_query_and_capture_output("(class_declaration (super_interfaces (type_list (type_identifier) @interface)))", code_to_process=source_code)
164+
interfaces = self.frame_query_and_capture_output(
165+
"(class_declaration (super_interfaces (type_list (type_identifier) @interface)))",
166+
code_to_process=source_code)
123167
return {interface.node.text.decode() for interface in interfaces}
124168

125169
def frame_query_and_capture_output(self, query: str, code_to_process: str) -> Captures:
@@ -138,7 +182,8 @@ def frame_query_and_capture_output(self, query: str, code_to_process: str) -> Ca
138182

139183
def get_method_name_from_declaration(self, method_name_string: str) -> str:
140184
"""Get the method name from the method signature."""
141-
captures: Captures = self.frame_query_and_capture_output("(method_declaration name: (identifier) @method_name)", method_name_string)
185+
captures: Captures = self.frame_query_and_capture_output("(method_declaration name: (identifier) @method_name)",
186+
method_name_string)
142187

143188
return captures[0].node.text.decode()
144189

@@ -147,7 +192,8 @@ def get_method_name_from_invocation(self, method_invocation: str) -> str:
147192
Using the tree-sitter query, extract the method name from the method invocation.
148193
"""
149194

150-
captures: Captures = self.frame_query_and_capture_output("(method_invocation object: (identifier) @class_name name: (identifier) @method_name)", method_invocation)
195+
captures: Captures = self.frame_query_and_capture_output(
196+
"(method_invocation object: (identifier) @class_name name: (identifier) @method_name)", method_invocation)
151197
return captures[0].node.text.decode()
152198

153199
def safe_ascend(self, node: Node, ascend_count: int) -> Node:
@@ -352,7 +398,8 @@ def get_method_return_type(self, source_code: str) -> str:
352398
The return type of the method.
353399
"""
354400

355-
type_references: Captures = self.frame_query_and_capture_output("(method_declaration type: ((type_identifier) @type_id))", source_code)
401+
type_references: Captures = self.frame_query_and_capture_output(
402+
"(method_declaration type: ((type_identifier) @type_id))", source_code)
356403

357404
return type_references[0].node.text.decode()
358405

@@ -379,9 +426,9 @@ def collect_leaf_token_values(node):
379426
if len(node.children) == 0:
380427
if filter_by_node_type is not None:
381428
if node.type in filter_by_node_type:
382-
lexical_tokens.append(code[node.start_byte : node.end_byte])
429+
lexical_tokens.append(code[node.start_byte: node.end_byte])
383430
else:
384-
lexical_tokens.append(code[node.start_byte : node.end_byte])
431+
lexical_tokens.append(code[node.start_byte: node.end_byte])
385432
else:
386433
for child in node.children:
387434
collect_leaf_token_values(child)
@@ -415,9 +462,11 @@ def remove_all_comments(self, source_code: str) -> str:
415462
pruned_source_code = self.make_pruned_code_prettier(source_code)
416463

417464
# Remove all comment lines: the comment lines start with / (for // and /*) or * (for multiline comments).
418-
comment_blocks: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)", code_to_process=source_code)
465+
comment_blocks: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)",
466+
code_to_process=source_code)
419467

420-
comment_lines: Captures = self.frame_query_and_capture_output(query="((line_comment) @comment_line)", code_to_process=source_code)
468+
comment_lines: Captures = self.frame_query_and_capture_output(query="((line_comment) @comment_line)",
469+
code_to_process=source_code)
421470

422471
for capture in comment_blocks:
423472
pruned_source_code = pruned_source_code.replace(capture.node.text.decode(), "")
@@ -441,7 +490,8 @@ def make_pruned_code_prettier(self, pruned_code: str) -> str:
441490
The prettified pruned code.
442491
"""
443492
# First remove remaining block comments
444-
block_comments: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)", code_to_process=pruned_code)
493+
block_comments: Captures = self.frame_query_and_capture_output(query="((block_comment) @comment_block)",
494+
code_to_process=pruned_code)
445495

446496
for capture in block_comments:
447497
pruned_code = pruned_code.replace(capture.node.text.decode(), "")

cldk/analysis/python/python.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,29 @@ def get_method_details(self, method_signature: str) -> PyMethod:
8787
"""
8888
return self.analysis_backend.get_method_details(self.source_code, method_signature)
8989

90-
def get_imports(self) -> List[PyImport]:
90+
def is_parsable(self, source_code: str) -> bool:
    """
    Check if the code is parsable

    Args:
        source_code: source code

    Returns:
        True if the code is parsable, False otherwise
    """
    # PythonSitter.is_parsable reads ``self.parser``. Calling it unbound with
    # this analysis object as ``self`` only works if this class happens to
    # expose a compatible ``parser`` attribute, so delegate to a real
    # PythonSitter instance instead.
    return PythonSitter().is_parsable(source_code)
100+
101+
def get_raw_ast(self, source_code: str) -> str:
    """
    Get the raw AST

    Args:
        source_code: source code

    Returns:
        Tree: the raw AST
    """
    # NOTE(review): the value returned is whatever PythonSitter.get_raw_ast
    # returns (annotated there as a tree-sitter ``Tree``), not a ``str``.
    # The annotation is left untouched because ``Tree`` is not visibly
    # imported in this module; import it and change the annotation together.
    #
    # PythonSitter.get_raw_ast reads ``self.parser``, so it must run against a
    # PythonSitter instance rather than being called unbound with this object.
    return PythonSitter().get_raw_ast(source_code)
111+
112+
def get_imports(self) -> List[PyImport]:
91113
"""
92114
Given an application or a source code, get all the imports
93115
"""

cldk/analysis/python/treesitter/python_sitter.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@
2323
from pathlib import Path
2424
from typing import List
2525

26-
from tree_sitter import Language, Parser, Query, Node
26+
from tree_sitter import Language, Parser, Query, Node, Tree
2727
import tree_sitter_python as tspython
28-
2928
from cldk.models.python.models import PyMethod, PyClass, PyArg, PyImport, PyModule, PyCallSite
3029
from cldk.models.treesitter import Captures
3130
from cldk.utils.treesitter.tree_sitter_utils import TreeSitterUtils
@@ -41,6 +40,44 @@ def __init__(self) -> None:
4140
self.parser: Parser = Parser(self.language)
4241
self.utils: TreeSitterUtils = TreeSitterUtils()
4342

43+
def is_parsable(self, code: str) -> bool:
    """
    Check if the code is parsable

    The code is parsed with ``self.parser`` and the resulting tree is searched
    for syntax-error markers.

    Args:
        code: source code

    Returns:
        True if the code is parsable, False otherwise
    """
    tree = self.parser.parse(bytes(code, "utf-8"))
    if tree is None:
        return False

    # Walk the tree with an explicit stack rather than recursion: a deeply
    # nested but valid tree would otherwise exhaust the interpreter's
    # recursion limit and be misreported (and printed to stdout) as a
    # syntax error.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        # ERROR nodes wrap text the parser could not place; nodes flagged
        # ``is_missing`` are tokens the parser inserted to recover from an
        # error. Either one means the input was not syntactically valid.
        if node.type == "ERROR" or node.is_missing:
            return False
        pending.extend(node.children)
    return True
69+
70+
def get_raw_ast(self, code: str) -> Tree:
    """
    Parse the given source text and hand back the parser's result unchanged.

    Args:
        code: source code

    Returns:
        Tree: the raw AST produced by ``self.parser``
    """
    # The parser is fed the UTF-8 encoded bytes of the source text.
    encoded_source = bytes(code, encoding="utf-8")
    raw_tree = self.parser.parse(encoded_source)
    return raw_tree
80+
4481
def get_all_methods(self, module: str) -> List[PyMethod]:
4582
"""
4683
Get all the methods in the specific module.

cldk/analysis/symbol_table.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ def __init__(self) -> None:
2929
Language agnostic functions
3030
"""
3131

32+
@abstractmethod
def is_parsable(self, **kwargs):
    """
    Given a complete source file or a code snippet, report whether it is
    well formed and can therefore be parsed.
    """
38+
3239
@abstractmethod
3340
def get_methods(self, **kwargs):
3441
"""

tests/tree_sitter/python/test_python_tree_sitter.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,19 @@ def setUp(self):
1414

1515
def tearDown(self):
1616
"""Runs after each test case"""
17+
def test_is_parasable(self):
18+
module_str = """
19+
@staticmethod
20+
def foo() -> None:
21+
pass
22+
class Person:
23+
def __init__(self, name: str, age: int):
24+
self.name = name
25+
self.age = age
26+
@staticmethod
27+
def __str__(self):"
28+
"""
29+
self.assertFalse(self.python_tree_sitter.is_parsable(module_str))
1730

1831
def test_get_all_methods(self):
1932
module_str = """

0 commit comments

Comments
 (0)