Skip to content

Commit bfda88d

Browse files
committed
[llvm-advisor] Add code analysis and binary inspection parsers
- implement AST JSON parser for syntax tree analysis
- add LLVM IR parser for intermediate representation analysis
- add assembly parser for machine code inspection
- add binary size parser for executable size optimization
- add symbols parser for debugging and analysis support
1 parent 4bf0e36 commit bfda88d

File tree

5 files changed

+609
-0
lines changed

5 files changed

+609
-0
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# ===----------------------------------------------------------------------===//
2+
#
3+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# ===----------------------------------------------------------------------===//
8+
9+
import re
10+
from typing import Dict, List, Any
11+
from .base_parser import BaseParser
12+
from ..models import FileType, ParsedFile
13+
14+
15+
class AssemblyParser(BaseParser):
    """Parser for assembly listings: collects labels, per-mnemonic
    instruction counts, and section directives."""

    # Cap on how many lines of a large file are scanned.
    _LARGE_FILE_LINE_LIMIT = 5000

    def __init__(self):
        super().__init__(FileType.ASSEMBLY)
        # Label at column 0, e.g. "main:".
        self.label_pattern = re.compile(r"^(\w+):")
        # Instructions are indented; the pattern relies on leading whitespace.
        self.instruction_pattern = re.compile(r"^\s+(\w+)")
        # Common section directives (.text/.data/.bss/.rodata).
        self.section_pattern = re.compile(r"^\s*\.(text|data|bss|rodata)")

    def parse(self, file_path: str) -> ParsedFile:
        """Parse the assembly file at *file_path*.

        Large files (per ``self.is_large_file``) are only partially scanned.
        Errors are reported through the returned ParsedFile's metadata
        rather than raised.
        """
        if self.is_large_file(file_path):
            return self._parse_large_assembly(file_path)

        content = self.read_file_safe(file_path)
        if content is None:
            return self.create_parsed_file(
                file_path, {}, {"error": "File too large or unreadable"}
            )

        try:
            lines = content.split("\n")
            asm_data = self._analyze_assembly_content(lines)

            metadata = {
                "file_size": self.get_file_size(file_path),
                "total_lines": len(lines),
                **asm_data["summary"],
            }

            return self.create_parsed_file(file_path, asm_data, metadata)

        except Exception as e:
            return self.create_parsed_file(file_path, {}, {"error": str(e)})

    def _parse_large_assembly(self, file_path: str) -> ParsedFile:
        """Partially parse a large file by analyzing only its first
        ``_LARGE_FILE_LINE_LIMIT`` lines.

        Bug fix: the previous inline scan matched the instruction pattern
        (which requires leading whitespace) against *stripped* lines, so
        instructions were never counted for large files. Delegating to
        _analyze_assembly_content fixes that and keeps both paths consistent.
        """
        try:
            lines: List[str] = []
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                for i, line in enumerate(f):
                    if i >= self._LARGE_FILE_LINE_LIMIT:
                        break
                    lines.append(line.rstrip("\n"))

            asm_data = self._analyze_assembly_content(lines)
            # Mark the result as a truncated view of the file.
            asm_data["summary"]["analyzed_lines"] = len(lines)
            asm_data["summary"]["is_partial"] = True

            metadata = {
                "file_size": self.get_file_size(file_path),
                **asm_data["summary"],
            }

            return self.create_parsed_file(file_path, asm_data, metadata)

        except Exception as e:
            return self.create_parsed_file(file_path, {}, {"error": str(e)})

    def _analyze_assembly_content(self, lines: List[str]) -> Dict[str, Any]:
        """Scan *lines* and collect labels, instruction counts, and sections.

        Returns a dict with "labels" (list), "instructions" (mnemonic ->
        count), "sections" (list, may repeat), and a computed "summary".
        """
        asm_data: Dict[str, Any] = {
            "labels": [],
            "instructions": {},
            "sections": [],
            "summary": {},
        }

        for original_line in lines:
            line = original_line.strip()

            # Skip blanks and full-line comments ("#" or ";").
            if not line or line.startswith("#") or line.startswith(";"):
                continue

            label_match = self.label_pattern.match(line)
            if label_match:
                asm_data["labels"].append(label_match.group(1))
                continue

            # Match against the unstripped line: the instruction pattern
            # distinguishes instructions from labels by their indentation.
            inst_match = self.instruction_pattern.match(original_line)
            if inst_match:
                inst = inst_match.group(1)
                asm_data["instructions"][inst] = (
                    asm_data["instructions"].get(inst, 0) + 1
                )
                continue

            section_match = self.section_pattern.match(line)
            if section_match:
                asm_data["sections"].append(section_match.group(1))

        asm_data["summary"] = {
            "label_count": len(asm_data["labels"]),
            "instruction_types": len(asm_data["instructions"]),
            "total_instructions": sum(asm_data["instructions"].values()),
            # Sections may repeat; count distinct names.
            "section_count": len(set(asm_data["sections"])),
        }

        return asm_data
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# ===----------------------------------------------------------------------===//
2+
#
3+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# ===----------------------------------------------------------------------===//
8+
9+
import json
10+
from typing import Dict, Any
11+
from .base_parser import BaseParser
12+
from ..models import FileType, ParsedFile
13+
14+
15+
class ASTParser(BaseParser):
    """Parser for AST dumps serialized as JSON."""

    # How many bytes of a large file to inspect for the root node.
    _CHUNK_SIZE = 10000

    def __init__(self):
        super().__init__(FileType.AST_JSON)

    def parse(self, file_path: str) -> ParsedFile:
        """Parse the JSON AST at *file_path*.

        Large files (per ``self.is_large_file``) get a best-effort partial
        parse. Errors are reported via the returned ParsedFile's metadata.
        """
        if self.is_large_file(file_path):
            return self._parse_large_ast(file_path)

        content = self.read_file_safe(file_path)
        if content is None:
            return self.create_parsed_file(
                file_path, {}, {"error": "File too large or unreadable"}
            )

        try:
            ast_data = json.loads(content)

            # Extract summary information for the metadata.
            summary = self._extract_ast_summary(ast_data)

            metadata = {
                "file_size": self.get_file_size(file_path),
                "ast_summary": summary,
            }

            return self.create_parsed_file(file_path, ast_data, metadata)

        except Exception as e:
            return self.create_parsed_file(file_path, {}, {"error": str(e)})

    def _parse_large_ast(self, file_path: str) -> ParsedFile:
        """Best-effort parse of a large AST file.

        Reads an initial chunk and tries to decode the first balanced
        top-level JSON object, so at least the root node is summarized.
        Falls back to a "too large" error entry when that fails.
        """
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                chunk = f.read(self._CHUNK_SIZE)

            if chunk.startswith("{"):
                # Track brace depth to find the first balanced object.
                depth = 0
                for i, char in enumerate(chunk):
                    if char == "{":
                        depth += 1
                    elif char == "}":
                        depth -= 1
                        if depth == 0:
                            try:
                                partial_data = json.loads(chunk[: i + 1])
                            except json.JSONDecodeError:
                                # Bug fix: was a bare `except:`. Only a JSON
                                # decode failure should abandon the partial
                                # parse; anything else propagates to the
                                # outer handler.
                                break

                            summary = self._extract_ast_summary(
                                partial_data, partial=True
                            )
                            metadata = {
                                "file_size": self.get_file_size(file_path),
                                "ast_summary": summary,
                                "is_partial": True,
                            }
                            return self.create_parsed_file(
                                file_path, partial_data, metadata
                            )

            metadata = {
                "file_size": self.get_file_size(file_path),
                "error": "File too large to parse completely",
            }

            return self.create_parsed_file(file_path, {}, metadata)

        except Exception as e:
            return self.create_parsed_file(file_path, {}, {"error": str(e)})

    def _extract_ast_summary(
        self, ast_data: Dict[str, Any], partial: bool = False
    ) -> Dict[str, Any]:
        """Summarize the root node: kind, id, and child node-kind counts.

        *partial* marks summaries built from a truncated parse.
        """
        summary: Dict[str, Any] = {
            "root_kind": ast_data.get("kind", "unknown"),
            "root_id": ast_data.get("id", "unknown"),
            "has_inner": "inner" in ast_data,
            "is_partial": partial,
        }

        inner = ast_data.get("inner")
        if isinstance(inner, list):
            summary["inner_count"] = len(inner)

            # Frequency of each child node kind.
            node_types: Dict[str, int] = {}
            for node in inner:
                if isinstance(node, dict) and "kind" in node:
                    kind = node["kind"]
                    node_types[kind] = node_types.get(kind, 0) + 1

            summary["node_types"] = node_types

        return summary
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# ===----------------------------------------------------------------------===//
2+
#
3+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
# ===----------------------------------------------------------------------===//
8+
9+
import re
10+
from typing import Dict, List, Any
11+
from .base_parser import BaseParser
12+
from ..models import FileType, ParsedFile, BinarySize
13+
14+
15+
class BinarySizeParser(BaseParser):
    """Parser for binary-size reports: size(1) columns, nm-style rows,
    or a generic "section: size" text format."""

    def __init__(self):
        super().__init__(FileType.BINARY_SIZE)
        # Pattern for size output like: "1234 5678 90 12345 section_name"
        self.size_pattern = re.compile(r"^\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)$")
        # Pattern for nm-style output with size
        self.nm_pattern = re.compile(
            r"^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)\s+([A-Za-z])\s+(.+)$"
        )

    def parse(self, file_path: str) -> ParsedFile:
        """Parse *file_path* into a list of BinarySize entries.

        Errors are reported via the returned ParsedFile's metadata.
        """
        content = self.read_file_safe(file_path)
        if content is None:
            return self.create_parsed_file(
                file_path, [], {"error": "File too large or unreadable"}
            )

        try:
            size_data = self._parse_size_output(content.split("\n"))

            metadata = {
                "file_size": self.get_file_size(file_path),
                "total_sections": len(size_data),
                "total_binary_size": sum(item.size for item in size_data),
            }

            return self.create_parsed_file(file_path, size_data, metadata)

        except Exception as e:
            return self.create_parsed_file(file_path, [], {"error": str(e)})

    def _parse_size_output(self, lines: List[str]) -> List[BinarySize]:
        """Convert report lines to BinarySize entries with percentages.

        Each line is tried against three formats in order: size(1)
        columns, nm output (hex size), and a "section: size" fallback.
        Percentages are relative to the accumulated total.
        """
        size_data: List[BinarySize] = []
        total_size = 0

        for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            # size(1) format: text/data/bss columns plus a total column.
            size_match = self.size_pattern.match(line)
            if size_match:
                text_size = int(size_match.group(1))
                data_size = int(size_match.group(2))
                bss_size = int(size_match.group(3))
                total = int(size_match.group(4))
                name = size_match.group(5)

                # Record only non-empty sub-sections.
                for suffix, section_size in (
                    (".text", text_size),
                    (".data", data_size),
                    (".bss", bss_size),
                ):
                    if section_size > 0:
                        size_data.append(
                            BinarySize(section=f"{name}{suffix}", size=section_size)
                        )

                # NOTE(review): the total column feeds the percentage base
                # and may differ from the sum of the recorded sub-sections.
                total_size += total
                continue

            # nm-style format; the size column (group 2) is hexadecimal.
            # Bug fix: the address and symbol-type groups were bound to
            # unused locals; they are now simply not extracted.
            nm_match = self.nm_pattern.match(line)
            if nm_match:
                size = int(nm_match.group(2), 16)
                size_data.append(BinarySize(section=nm_match.group(4), size=size))
                total_size += size
                continue

            # Generic "section: size" fallback; first integer after the
            # colon is taken as the size.
            if ":" in line:
                section_name, _, size_part = line.partition(":")
                size_numbers = re.findall(r"\d+", size_part)
                if size_numbers:
                    size = int(size_numbers[0])
                    size_data.append(
                        BinarySize(section=section_name.strip(), size=size)
                    )
                    total_size += size

        # Attach each entry's share of the overall total.
        if total_size > 0:
            for item in size_data:
                item.percentage = (item.size / total_size) * 100

        return size_data

0 commit comments

Comments
 (0)