feat: Update code analyzers to follow the new affected artefacts data structure

GermanMT · GermanMT · commit 778c25eba102 · 2025-09-06T17:08:10.000+02:00
diff --git a/app/utils/code_analyzer/codes/__init__.py b/app/utils/code_analyzer/codes/__init__.py
@@ -1,7 +1,8 @@
 from .cs_code_analyzer import cs_get_used_artifacts, cs_is_imported
 from .java_code_analyzer import java_get_used_artifacts, java_is_imported
 from .js_ts_code_analyzer import js_ts_get_used_artifacts, js_ts_is_imported
-from .py_code_analyzer import python_get_used_artifacts, python_is_imported
+from .py_code_analyzer import py_get_used_artifacts, py_is_imported
+from .rb_code_analyzer import rb_get_used_artifacts, rb_is_imported
 from .rs_code_analyzer import rs_get_used_artifacts, rs_is_imported
 
 __all__ = [
@@ -11,8 +12,10 @@
     "java_is_imported",
     "js_ts_get_used_artifacts",
     "js_ts_is_imported",
-    "python_get_used_artifacts",
-    "python_is_imported",
+    "py_get_used_artifacts",
+    "py_is_imported",
+    "rb_get_used_artifacts",
+    "rb_is_imported",
     "rs_get_used_artifacts",
     "rs_is_imported"
 ]
diff --git a/app/utils/code_analyzer/codes/cs_code_analyzer.py b/app/utils/code_analyzer/codes/cs_code_analyzer.py
@@ -2,6 +2,8 @@
 
 from regex import findall, search
 
+from .is_relevant import is_relevant
+
 
 async def cs_is_imported(file_path: str, namespace: str) -> Any:
     with open(file_path, encoding="utf-8") as file:
@@ -16,28 +18,32 @@ async def cs_get_used_artifacts(
     filename: str,
     namespace: str,
     cve_description: str,
-    affected_artefacts: dict[str, list[str]]
+    affected_artefacts: dict[str, dict[str, list[str]]]
 ) -> list[dict[str, Any]]:
     with open(filename, encoding="utf-8") as file:
         code = file.read()
         current_line = 1
         used_artifacts = await get_child_artifacts(namespace, code, cve_description, affected_artefacts)
         for line in code.split("\n"):
             if not search(r"using\s", line):
-                for (artifact, _type) in used_artifacts:
+                for (artifact, _type, source) in used_artifacts:
                     if artifact in line:
-                        used_artifacts[(artifact, _type)].append(current_line)
+                        used_artifacts[(artifact, _type, source)].append(current_line)
             current_line += 1
         used_artifacts = {
-            (artifact, _type): lines
-            for (artifact, _type), lines in used_artifacts.items()
+            (artifact, _type, source): lines
+            for (artifact, _type, source), lines in used_artifacts.items()
             if lines
         }
         result = []
-        for (artifact_name, artifact_type), used_in_lines in used_artifacts.items():
+        groups_by_name_type = {}
+        for (artifact_name, artifact_type, source), used_in_lines in used_artifacts.items():
+            groups_by_name_type.setdefault((artifact_name, artifact_type, used_in_lines), []).append(source)
+        for (artifact_name, artifact_type, used_in_lines), sources in groups_by_name_type.items():
             result.append({
                 "artifact_name": artifact_name,
                 "artifact_type": artifact_type,
+                "sources": sources,
                 "used_in_lines": used_in_lines
             })
         return result
@@ -48,27 +54,26 @@ async def get_child_artifacts(
     code: str,
     cve_description: str,
     affected_artefacts: dict[str, list[str]]
-) -> dict[tuple[str, str], list[int]]:
-    used_artifacts: dict[tuple[str, str], list[int]] = {}
-    def is_relevant(artifact: str, artifact_type: str) -> bool:
-        artifact_lower = artifact.lower()
-        if artifact_lower in cve_description.lower():
-            return True
-        return artifact in affected_artefacts.get(artifact_type, [])
-    for match in findall(rf"{parent}\.[^\(\)\s:;]+", code):
-        for artifact in match.split(".")[1:]:
-            clean = artifact.strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
-    for match in findall(rf"using\s+{parent}\s*;\s*{{[^}}]+}}", code):
-        for artifact in match.split("{")[1].split("}")[0].split(","):
-            clean = artifact.strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
+) -> dict[tuple[str, str, str], list[int]]:
+    used_artifacts: dict[tuple[str, str, str], list[int]] = {}
+    patterns = [
+        (rf"{parent}\.[^\(\)\s:;]+", "split_by_dot"),
+        (rf"using\s+{parent}\s*;\s*{{[^}}]+}}", "split_by_braces"),
+    ]
+    for pattern, split_type in patterns:
+        for match in findall(pattern, code):
+            if split_type == "split_by_dot":
+                artifacts = match.split(".")[1:]
+            elif split_type == "split_by_braces":
+                artifacts = match.split("{")[1].split("}")[0].split(",")
+            for artifact in artifacts:
+                clean = artifact.strip()
+                for source, artifact_types in affected_artefacts.items():
+                    for artifact_type, artefacts in artifact_types["artefacts"].items():
+                        if await is_relevant(clean, artefacts, cve_description):
+                            used_artifacts.setdefault((clean, artifact_type, source), [])
     aux = {}
-    for (artifact, _) in used_artifacts:
+    for (artifact, _, _) in used_artifacts:
         aux.update(await get_child_artifacts(artifact, code, cve_description, affected_artefacts))
     used_artifacts.update(aux)
     return used_artifacts
diff --git a/app/utils/code_analyzer/codes/java_code_analyzer.py b/app/utils/code_analyzer/codes/java_code_analyzer.py
@@ -2,6 +2,8 @@
 
 from regex import findall, search
 
+from .is_relevant import is_relevant
+
 
 async def java_is_imported(file_path: str, dependency: str) -> Any:
     with open(file_path, encoding="utf-8") as file:
@@ -16,28 +18,32 @@ async def java_get_used_artifacts(
     filename: str,
     dependency: str,
     cve_description: str,
-    affected_artefacts: dict[str, list[str]]
+    affected_artefacts: dict[str, dict[str, list[str]]]
 ) -> list[dict[str, Any]]:
     with open(filename, encoding="utf-8") as file:
         code = file.read()
         current_line = 1
         used_artifacts = await get_child_artifacts(dependency, code, cve_description, affected_artefacts)
         for line in code.split("\n"):
             if "import" not in line:
-                for (artifact, _type) in used_artifacts:
+                for (artifact, _type, source) in used_artifacts:
                     if artifact in line:
-                        used_artifacts[(artifact, _type)].append(current_line)
+                        used_artifacts[(artifact, _type, source)].append(current_line)
             current_line += 1
         used_artifacts = {
-            (artifact, _type): lines
-            for (artifact, _type), lines in used_artifacts.items()
+            (artifact, _type, source): lines
+            for (artifact, _type, source), lines in used_artifacts.items()
             if lines
         }
         result = []
-        for (artifact_name, artifact_type), used_in_lines in used_artifacts.items():
+        groups_by_name_type = {}
+        for (artifact_name, artifact_type, source), used_in_lines in used_artifacts.items():
+            groups_by_name_type.setdefault((artifact_name, artifact_type, used_in_lines), []).append(source)
+        for (artifact_name, artifact_type, used_in_lines), sources in groups_by_name_type.items():
             result.append({
                 "artifact_name": artifact_name,
                 "artifact_type": artifact_type,
+                "sources": sources,
                 "used_in_lines": used_in_lines
             })
         return result
@@ -48,27 +54,29 @@ async def get_child_artifacts(
     code: str,
     cve_description: str,
     affected_artefacts: dict[str, list[str]]
-) -> dict[tuple[str, str], list[int]]:
-    used_artifacts: dict[tuple[str, str], list[int]] = {}
-    def is_relevant(artifact: str, artifact_type: str) -> bool:
-        artifact_lower = artifact.lower()
-        if artifact_lower in cve_description.lower():
-            return True
-        return artifact in affected_artefacts.get(artifact_type, [])
-    for match in findall(rf"{parent}\.[^\(\)\s:;]+", code):
-        for artifact in match.split(".")[1:]:
-            clean = artifact.strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
-    for match in findall(rf"import\s+{parent}\.[^\(\)\s:;]+;", code):
-        for artifact in match.split(parent + ".")[1:]:
-            clean = artifact.replace(";", "").strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
+) -> dict[tuple[str, str, str], list[int]]:
+    used_artifacts: dict[tuple[str, str, str], list[int]] = {}
+    patterns = [
+        (rf"{parent}\.[^\(\)\s:;]+", "split_by_dot"),
+        (rf"import\s+{parent}\.[^\(\)\s:;]+;", "split_by_import"),
+    ]
+    for pattern, split_type in patterns:
+        for match in findall(pattern, code):
+            if split_type == "split_by_dot":
+                artifacts = match.split(".")[1:]
+            elif split_type == "split_by_import":
+                artifacts = match.split(parent + ".")[1:]
+            for artifact in artifacts:
+                if split_type == "split_by_import":
+                    clean = artifact.replace(";", "").strip()
+                else:
+                    clean = artifact.strip()
+                for source, artifact_types in affected_artefacts.items():
+                    for artifact_type, artefacts in artifact_types["artefacts"].items():
+                        if await is_relevant(clean, artefacts, cve_description):
+                            used_artifacts.setdefault((clean, artifact_type, source), [])
     aux = {}
-    for (artifact, _) in used_artifacts:
+    for (artifact, _, _) in used_artifacts:
         aux.update(await get_child_artifacts(artifact, code, cve_description, affected_artefacts))
     used_artifacts.update(aux)
     return used_artifacts
diff --git a/app/utils/code_analyzer/codes/js_ts_code_analyzer.py b/app/utils/code_analyzer/codes/js_ts_code_analyzer.py
@@ -2,6 +2,8 @@
 
 from regex import findall, search
 
+from .is_relevant import is_relevant
+
 
 async def js_ts_is_imported(file_path: str, dependency: str) -> Any:
     with open(file_path, encoding="utf-8") as file:
@@ -16,28 +18,32 @@ async def js_ts_get_used_artifacts(
     filename: str,
     dependency: str,
     cve_description: str,
-    affected_artefacts: dict[str, list[str]]
+    affected_artefacts: dict[str, dict[str, list[str]]]
 ) -> list[dict[str, Any]]:
     with open(filename, encoding="utf-8") as file:
         code = file.read()
         current_line = 1
         used_artifacts = await get_child_artifacts(dependency, code, cve_description, affected_artefacts)
         for line in code.split("\n"):
             if not search(r"import\s|require\(", line):
-                for (artifact, _type) in used_artifacts:
+                for (artifact, _type, source) in used_artifacts:
                     if artifact in line:
-                        used_artifacts[(artifact, _type)].append(current_line)
+                        used_artifacts[(artifact, _type, source)].append(current_line)
             current_line += 1
         used_artifacts = {
-            (artifact, _type): lines
-            for (artifact, _type), lines in used_artifacts.items()
+            (artifact, _type, source): lines
+            for (artifact, _type, source), lines in used_artifacts.items()
             if lines
         }
         result = []
-        for (artifact_name, artifact_type), used_in_lines in used_artifacts.items():
+        groups_by_name_type = {}
+        for (artifact_name, artifact_type, source), used_in_lines in used_artifacts.items():
+            groups_by_name_type.setdefault((artifact_name, artifact_type, used_in_lines), []).append(source)
+        for (artifact_name, artifact_type, used_in_lines), sources in groups_by_name_type.items():
             result.append({
                 "artifact_name": artifact_name,
                 "artifact_type": artifact_type,
+                "sources": sources,
                 "used_in_lines": used_in_lines
             })
         return result
@@ -48,33 +54,27 @@ async def get_child_artifacts(
     code: str,
     cve_description: str,
     affected_artefacts: dict[str, list[str]]
-) -> dict[tuple[str, str], list[int]]:
-    used_artifacts: dict[tuple[str, str], list[int]] = {}
-    def is_relevant(artifact: str, artifact_type: str) -> bool:
-        artifact_lower = artifact.lower()
-        if artifact_lower in cve_description.lower():
-            return True
-        return artifact in affected_artefacts.get(artifact_type, [])
-    for match in findall(rf"{parent}\.[^\(\)\s:;]+", code):
-        for artifact in match.split(".")[1:]:
-            clean = artifact.strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
-    for match in findall(rf"import\s+{{[^}}]+}}\s+from\s+['\"]{parent}['\"]", code):
-        for artifact in match.split("{")[1].split("}")[0].split(","):
-            clean = artifact.strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
-    for match in findall(rf"const\s+{{[^}}]+}}\s*=\s*require\(['\"]{parent}['\"]\)", code):
-        for artifact in match.split("{")[1].split("}")[0].split(","):
-            clean = artifact.strip()
-            for artifact_type in affected_artefacts:
-                if is_relevant(clean, artifact_type):
-                    used_artifacts.setdefault((clean, artifact_type), [])
+) -> dict[tuple[str, str, str], list[int]]:
+    used_artifacts: dict[tuple[str, str, str], list[int]] = {}
+    patterns = [
+        (rf"{parent}\.[^\(\)\s:;]+", "split_by_dot"),
+        (rf"import\s+{{[^}}]+}}\s+from\s+['\"]{parent}['\"]", "split_by_braces"),
+        (rf"const\s+{{[^}}]+}}\s*=\s*require\(['\"]{parent}['\"]\)", "split_by_braces"),
+    ]
+    for pattern, split_type in patterns:
+        for match in findall(pattern, code):
+            if split_type == "split_by_dot":
+                artifacts = match.split(".")[1:]
+            elif split_type == "split_by_braces":
+                artifacts = match.split("{")[1].split("}")[0].split(",")
+            for artifact in artifacts:
+                clean = artifact.strip()
+                for source, artifact_types in affected_artefacts.items():
+                    for artifact_type, artefacts in artifact_types["artefacts"].items():
+                        if await is_relevant(clean, artefacts, cve_description):
+                            used_artifacts.setdefault((clean, artifact_type, source), [])
     aux = {}
-    for (artifact, _) in used_artifacts:
+    for (artifact, _, _) in used_artifacts:
         aux.update(await get_child_artifacts(artifact, code, cve_description, affected_artefacts))
     used_artifacts.update(aux)
     return used_artifacts
diff --git a/app/utils/code_analyzer/codes/py_code_analyzer.py b/app/utils/code_analyzer/codes/py_code_analyzer.py
diff --git a/app/utils/code_analyzer/codes/rs_code_analyzer.py b/app/utils/code_analyzer/codes/rs_code_analyzer.py