Skip to content

Commit fe2a7da

Browse files
authored
Java transitive search (#130)
* Java transitive search Signed-off-by: Theodor Mihalache <tmihalac@tmihalac-thinkpadp1gen7.rmtusfl.csb>
1 parent daf988c commit fe2a7da

23 files changed

+8566
-413
lines changed

.tekton/on-pull-request.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,23 @@ spec:
178178
print_banner "INSTALLING DEPENDENCIES"
179179
uv sync
180180
181+
# Install Java
print_banner "Installing Java"

# Temurin JDK 22 release for x64 Linux. JDK_DIR is the directory the archive
# is expected to unpack to; it is used below to build JAVA_HOME.
JAVA_ARCH="x64"
JDK_URL="https://github.com/adoptium/temurin22-binaries/releases/download/jdk-22.0.2%2B9/OpenJDK22U-jdk_${JAVA_ARCH}_linux_hotspot_22.0.2_9.tar.gz"
JDK_DIR="jdk-22.0.2+9"

echo ">> Downloading $JDK_URL"
mkdir -p /tekton/home/jdk
# -f makes curl exit non-zero on HTTP errors instead of saving the error page.
curl -fsSL -o /tekton/home/jdk/jdk.tgz "$JDK_URL"
tar -C /tekton/home/jdk -xzf /tekton/home/jdk/jdk.tgz
rm -f /tekton/home/jdk/jdk.tgz

export JAVA_HOME="/tekton/home/jdk/${JDK_DIR}"
export PATH="$JAVA_HOME/bin:$PATH"

echo "Java version:"
# Not guarded with `|| true`: the Maven step that follows needs a working JDK,
# so a broken install should surface here, like `go version` and `mvn -v` do.
java -version
197+
181198
# Install Go
182199
print_banner "Installing Go"
183200

@@ -190,6 +207,24 @@ spec:
190207
echo "Go version:"
191208
go version
192209

210+
# Install Maven
print_banner "Installing Maven"

MAVEN_VERSION="3.9.11"
ARCHIVE="apache-maven-${MAVEN_VERSION}-bin.tar.gz"
URL="https://archive.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries/${ARCHIVE}"

# -f makes curl exit non-zero on HTTP errors, so a bad URL fails here instead
# of handing an HTML error page to tar (same flags as the JDK download).
curl -fsSL -o "${ARCHIVE}" "${URL}"
mkdir -p "$HOME/maven-sdk"
tar -C "$HOME/maven-sdk" -xzf "${ARCHIVE}"
# Remove the downloaded archive so it does not linger in the working directory.
rm -f "${ARCHIVE}"

export MAVEN_HOME="$HOME/maven-sdk/apache-maven-${MAVEN_VERSION}"
export M2_HOME="$MAVEN_HOME"
export PATH="$MAVEN_HOME/bin:$PATH"

echo "Maven version:"
mvn -v
227+
193228
print_banner "RUNNING LINTER"
194229
# Add the current directory to git's safe directories to avoid ownership errors.
195230
git config --global --add safe.directory /workspace/source

Dockerfile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,27 @@ RUN curl -L -X GET https://go.dev/dl/go1.24.1.linux-amd64.tar.gz -o /tmp/go1.24.
4444
&& tar -C /usr/local -xzf /tmp/go1.24.1.linux-amd64.tar.gz \
4545
&& rm /tmp/go1.24.1.linux-amd64.tar.gz
4646

47+
# --- Temurin JDK 22 (amd64/x86_64) ---
# JDK_DIR is expected to be the top-level directory inside the JDK_URL archive;
# JAVA_HOME is derived from it, so override both build args together.
ARG JDK_URL="https://github.com/adoptium/temurin22-binaries/releases/download/jdk-22.0.2%2B9/OpenJDK22U-jdk_x64_linux_hotspot_22.0.2_9.tar.gz"
ARG JDK_DIR="jdk-22.0.2+9"
RUN mkdir -p /opt/jdk \
    && curl -fsSL -o /tmp/jdk.tgz "${JDK_URL}" \
    && tar -C /opt/jdk -xzf /tmp/jdk.tgz \
    && rm -f /tmp/jdk.tgz
ENV JAVA_HOME=/opt/jdk/${JDK_DIR}
ENV PATH="${JAVA_HOME}/bin:${PATH}"

# --- Maven (version selected by the MVN_VER build arg) ---
# Required, not optional: the verify step below runs `mvn -v`.
ARG MVN_VER=3.9.11
RUN curl -fsSL -o /tmp/maven.tgz \
    "https://archive.apache.org/dist/maven/maven-3/${MVN_VER}/binaries/apache-maven-${MVN_VER}-bin.tar.gz" \
    && tar -C /opt -xzf /tmp/maven.tgz \
    && rm -f /tmp/maven.tgz
# MAVEN_HOME is set in its own ENV instruction so the PATH line can expand it.
ENV MAVEN_HOME=/opt/apache-maven-${MVN_VER}
ENV PATH="${MAVEN_HOME}/bin:${PATH}"

# Verify: fail the image build if either tool is not runnable from PATH.
RUN java -version && mvn -v
67+
4768
# Set SSL environment variables
4869
ENV REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
4970
ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt

src/vuln_analysis/tools/tests/test_transitive_code_search.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,77 @@ async def test_c_transitive_search_2():
300300
print(f"DEBUG: len(list_path) = {len(list_path)}")
301301
assert len(list_path) == 1
302302
assert path_found == False
303+
304+
@pytest.mark.asyncio
async def test_transitive_search_java_1():
    """Java search for a Maven GAV + method that is expected to be unreachable.

    Runs the transitive code search against a pinned cryostat commit and
    expects no call path to ``PropertyUtilsBean.getProperty`` to be found,
    with a single entry in the returned path list.
    """
    transitive_code_search_runner_coroutine = await get_transitive_code_runner_function()
    set_input_for_next_run(git_repository="https://github.com/cryostatio/cryostat",
                           git_ref="8f753753379e9381429b476aacbf6890ef101438",
                           included_extensions=["**/*.java"],
                           excluded_extensions=["target/**/*",
                                                "build/**/*",
                                                "*.class",
                                                ".gradle/**/*",
                                                ".mvn/**/*",
                                                ".gitignore",
                                                "test/**/*",
                                                "tests/**/*",
                                                "src/test/**/*",
                                                "pom.xml",
                                                "build.gradle"])
    result = await transitive_code_search_runner_coroutine("commons-beanutils:commons-beanutils:1.9.4,org.apache.commons.beanutils.PropertyUtilsBean.getProperty")
    (path_found, list_path) = result
    print(result)
    assert path_found is False
    # Compare by value: `is` on an int literal tests object identity and only
    # passes by accident of CPython's small-integer caching.
    assert len(list_path) == 1
326+
327+
@pytest.mark.asyncio
async def test_transitive_search_java_2():
    """Java search for a Maven GAV + method that is expected to be reachable.

    Runs the transitive code search against a pinned cryostat commit and
    expects a two-element call path to ``StringUtils.isBlank`` whose second
    document is cryostat application source containing the call.
    """
    transitive_code_search_runner_coroutine = await get_transitive_code_runner_function()
    set_input_for_next_run(git_repository="https://github.com/cryostatio/cryostat",
                           git_ref="8f753753379e9381429b476aacbf6890ef101438",
                           included_extensions=["**/*.java"],
                           excluded_extensions=["target/**/*",
                                                "build/**/*",
                                                "*.class",
                                                ".gradle/**/*",
                                                ".mvn/**/*",
                                                ".gitignore",
                                                "test/**/*",
                                                "tests/**/*",
                                                "src/test/**/*",
                                                "pom.xml",
                                                "build.gradle"])
    result = await transitive_code_search_runner_coroutine("org.apache.commons:commons-lang3:3.14.0,org.apache.commons.lang3.StringUtils.isBlank")
    (path_found, list_path) = result
    print(result)
    assert path_found is True
    # Compare by value: `is` on an int literal tests object identity and only
    # passes by accident of CPython's small-integer caching.
    assert len(list_path) == 2
    document = list_path[1]
    assert 'src/main/java/io/cryostat' in document.metadata['source']
    assert 'StringUtils.isBlank(' in document.page_content
352+
353+
# @pytest.mark.asyncio
354+
# async def test_transitive_search_java_3():
355+
# transitive_code_search_runner_coroutine = await get_transitive_code_runner_function()
356+
# set_input_for_next_run(git_repository="https://github.com/cryostatio/cryostat",
357+
# git_ref="8f753753379e9381429b476aacbf6890ef101438",
358+
# included_extensions=["**/*.java"],
359+
# excluded_extensions=["target/**/*",
360+
# "build/**/*",
361+
# "*.class",
362+
# ".gradle/**/*",
363+
# ".mvn/**/*",
364+
# ".gitignore",
365+
# "test/**/*",
366+
# "tests/**/*",
367+
# "src/test/**/*",
368+
# "pom.xml",
369+
# "build.gradle"])
370+
# result = await transitive_code_search_runner_coroutine(",java.net.URLEncoder.encode")
371+
# (path_found, list_path) = result
372+
# print(result)
373+
# assert path_found is True
374+
# assert len(list_path) > 1
375+
# document = list_path[-1]
376+
# assert 'src/main/java/io/cryostat' in document.metadata['source']

src/vuln_analysis/tools/transitive_code_search.py

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
1615
import os
1716

1817
from vuln_analysis.runtime_context import ctx_state
@@ -26,14 +25,18 @@
2625
from langchain.docstore.document import Document
2726

2827
from vuln_analysis.data_models.state import AgentMorpheusEngineState
29-
from ..utils.chain_of_calls_retriever import ChainOfCallsRetriever
30-
from vuln_analysis.utils.dep_tree import Ecosystem
3128
from vuln_analysis.utils.document_embedding import DocumentEmbedding
29+
from ..data_models.input import SourceDocumentsInfo
30+
from ..utils.chain_of_calls_retriever_base import ChainOfCallsRetrieverBase
31+
from ..utils.chain_of_calls_retriever_factory import get_chain_of_calls_retriever
32+
from ..utils.dep_tree import Ecosystem
3233
from ..utils.error_handling_decorator import catch_pipeline_errors_async, catch_tool_errors
3334
from ..utils.function_name_extractor import FunctionNameExtractor
3435
from ..utils.function_name_locator import FunctionNameLocator
3536

3637
from vuln_analysis.logging.loggers_factory import LoggingFactory
38+
from ..utils.java_chain_of_calls_retriever import JavaChainOfCallsRetriever
39+
3740

3841
PACKAGE_AND_FUNCTION_LOCATOR_TOOL_NAME = "package_and_function_locator"
3942

@@ -60,29 +63,33 @@ class PackageAndFunctionLocatorToolConfig(FunctionBaseConfig, name=("%s" % PACKA
6063
Package and function locator tool used to validate package names and find function names using fuzzy matching.
6164
"""
6265

63-
64-
def get_call_of_chains_retriever(documents_embedder, si):
66+
def get_call_of_chains_retriever(documents_embedder, si, query: str):
    """Build the chain-of-calls retriever for the code source in ``si``.

    Args:
        documents_embedder: Object providing ``get_repo_path`` and
            ``collect_documents`` for a source-info entry.
        si: Iterable of source-info entries; entries whose ``type`` is
            ``"code"`` are used. If several match, the last one wins.
        query: Tool query string, forwarded unchanged to the retriever factory.

    Returns:
        The retriever returned by ``get_chain_of_calls_retriever``.

    Raises:
        ValueError: If ``si`` contains no entry of type ``"code"``.
        KeyError: If ``ecosystem_data.txt`` does not name an ``Ecosystem`` member.
    """
    documents: list[Document]
    git_repo = None
    code_source_info: SourceDocumentsInfo
    for source_info in si:
        if source_info.type == "code":
            code_source_info = source_info
            git_repo = documents_embedder.get_repo_path(source_info)
            documents = documents_embedder.collect_documents(source_info)
    if git_repo is None:
        raise ValueError("No code source info found")
    with open(os.path.join(git_repo, 'ecosystem_data.txt'), 'r', encoding='utf-8') as file:
        # Strip surrounding whitespace so a trailing newline in the file does
        # not break the enum lookup by name below.
        ecosystem_name = file.read().strip()
    ecosystem = Ecosystem[ecosystem_name]
    coc_retriever = get_chain_of_calls_retriever(ecosystem=ecosystem,
                                                 documents=documents,
                                                 manifest_path=git_repo,
                                                 query=query,
                                                 code_source_info=code_source_info)
    return coc_retriever
7886

79-
80-
def get_transitive_code_searcher():
87+
def get_transitive_code_searcher(query: str):
    """Return the TransitiveCodeSearcher stored on the current engine state.

    A searcher is built and stored on the state when none exists yet, or when
    the stored one wraps a ``JavaChainOfCallsRetriever`` (in which case it is
    rebuilt on every call). Otherwise the stored searcher is reused.

    Args:
        query: Tool query string, forwarded to ``get_call_of_chains_retriever``
            whenever a searcher is (re)built.
    """
    state: AgentMorpheusEngineState = ctx_state.get()

    must_build = state.transitive_code_searcher is None
    if not must_build:
        # NOTE(review): Java retrievers are never reused; presumably because
        # they are constructed from the query -- confirm against the factory.
        current_retriever = state.transitive_code_searcher.chain_of_calls_retriever
        must_build = isinstance(current_retriever, JavaChainOfCallsRetriever)

    if must_build:
        source_infos = state.original_input.input.image.source_info
        embedder = DocumentEmbedding(embedding=None)
        retriever = get_call_of_chains_retriever(embedder, source_infos, query)
        state.transitive_code_searcher = TransitiveCodeSearcher(chain_of_calls_retriever=retriever)

    return state.transitive_code_searcher
@@ -108,16 +115,22 @@ async def transitive_search(config: TransitiveCodeSearchToolConfig,
108115
@catch_tool_errors(TRANSITIVE_CODE_SEARCH_TOOL_NAME)
109116
async def _arun(query: str) -> tuple:
110117
transitive_code_searcher: TransitiveCodeSearcher
111-
transitive_code_searcher = get_transitive_code_searcher()
118+
transitive_code_searcher = get_transitive_code_searcher(query)
112119
result = transitive_code_searcher.search(query)
113120
return result
114121

115122
yield FunctionInfo.from_fn(
116123
_arun,
117124
description=("""
118-
Checks if a function from a package is reachable from application code through the call chain.
119-
Input format: 'package_name,function_name'.
120-
Example: 'urllib,parse'.
125+
Checks if a function from a package is reachable from application code through the call chain.
126+
Make sure the input format is matching exactly one of the following formats:
127+
128+
Input format 1: 'package_name,function_name'.
129+
Example 1: 'urllib,parse'.
130+
131+
Input format 2(java): 'maven_gav,class_name.function_name'.
132+
Example 2(java): 'commons-beanutils:commons-beanutils:1.0.0,PropertyUtilsBean.setSimpleProperty'.
133+
121134
Returns: (is_reachable: bool, call_hierarchy_path: list).
122135
"""))
123136

@@ -131,9 +144,9 @@ async def functions_usage_search(config: CallingFunctionNameExtractorToolConfig,
131144
"""
132145
@catch_tool_errors(FUNCTION_NAME_EXTRACTOR_TOOL_NAME)
133146
async def _arun(query: str) -> list:
134-
coc_retriever: ChainOfCallsRetriever
147+
coc_retriever: ChainOfCallsRetrieverBase
135148
transitive_code_searcher: TransitiveCodeSearcher
136-
transitive_code_searcher = get_transitive_code_searcher()
149+
transitive_code_searcher = get_transitive_code_searcher(query)
137150
coc_retriever = transitive_code_searcher.chain_of_calls_retriever
138151
function_name_extractor = FunctionNameExtractor(coc_retriever)
139152
result = function_name_extractor.fetch_list(query)
@@ -154,33 +167,38 @@ async def package_and_function_locator(config: PackageAndFunctionLocatorToolConf
154167
builder: Builder): # pylint: disable=unused-argument
155168
"""
156169
Function Locator tool used to validate package names and find function names using fuzzy matching.
157-
Mandatory first step for code path analysis.
170+
Mandatory first step for code path analysis.
158171
"""
159172

160173
@catch_tool_errors(PACKAGE_AND_FUNCTION_LOCATOR_TOOL_NAME)
161174
async def _arun(query: str) -> dict:
162-
coc_retriever: ChainOfCallsRetriever
175+
coc_retriever: ChainOfCallsRetrieverBase
163176
transitive_code_searcher: TransitiveCodeSearcher
164-
transitive_code_searcher = get_transitive_code_searcher()
177+
transitive_code_searcher = get_transitive_code_searcher(query)
165178
coc_retriever = transitive_code_searcher.chain_of_calls_retriever
166179
locator = FunctionNameLocator(coc_retriever)
167180
result = await locator.locate_functions(query)
168181
pkg_msg = "Package is valid."
169-
if not locator.is_package_valid and not locator.is_std_package:
170-
pkg_msg = "Package is not valid."
171-
172-
182+
if not locator.is_package_valid and not locator.is_std_package:
183+
pkg_msg = "Package is not valid."
184+
173185
return {
174186
"ecosystem": coc_retriever.ecosystem.name,
175-
"package_msg": pkg_msg,
187+
"package_msg": pkg_msg,
176188
"result": result
177189
}
178190

179191
yield FunctionInfo.from_fn(
180192
_arun,
181193
description=("""
182194
Mandatory first step for code path analysis. Validates package names, locates functions using fuzzy matching, and provides ecosystem type (GO/Python/Java/JavaScript/C/C++).
183-
Input format: 'package_name,function_name' or 'package_name,class_name.method_name'
184-
Example: 'libxml2,xmlParseDocument'
195+
Make sure the input format is matching exactly one of the following formats:
196+
197+
Input format 1: 'package_name,function_name' or 'package_name,class_name.method_name'.
198+
Example 1: 'libxml2,xmlParseDocument'.
199+
200+
Input format 2(java): 'maven_gav,class_name.method_name'.
201+
Example 2(java): 'commons-beanutils:commons-beanutils:1.0.0,PropertyUtilsBean.setSimpleProperty'.
202+
185203
Returns: {'ecosystem': str, 'package_msg': str, 'result': [function_names]}.
186204
"""))

0 commit comments

Comments
 (0)