forked from cyyself/m1-pmu-gen
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapple_pdf_extract.py
More file actions
executable file
·58 lines (56 loc) · 2.17 KB
/
apple_pdf_extract.py
File metadata and controls
executable file
·58 lines (56 loc) · 2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
#
# Extracts PMU event descriptions from the Apple Silicon CPU Optimization
# Guide, which can be downloaded from
# https://developer.apple.com/download/apple-silicon-cpu-optimization-guide/
def extract_event_desc(pdf_path):
    """Extract PMU event descriptions from the optimization guide PDF.

    Only pages whose text contains both "Event Name" and "Brief Description"
    are examined. On those pages, every table whose first two header cells
    are exactly "Event Name" and "Brief Description" is read row by row.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``pymupdf.open``.

    Returns:
        A dict mapping each event name to its description, with the lines of
        a multi-line description joined by single spaces. When the same
        event name occurs more than once, the last occurrence wins.

    Raises:
        SystemExit: With status 1, after printing the page number to stderr,
            when the header row of a table on a matching page cannot be read.
    """
    import pymupdf
    import re
    import tqdm
    import sys

    # Ways a cell access can fail: the row/table is too short (IndexError) or
    # the table, row or cell is not the expected list/str, e.g. None
    # (TypeError / AttributeError). Catching only these keeps
    # KeyboardInterrupt and SystemExit propagating normally.
    cell_errors = (IndexError, AttributeError, TypeError)

    # Compiled once instead of being looked up for every table row.
    event_name_re = re.compile(r"^[A-Z][A-Z0-9_]+$")

    def is_event_name(s):
        """Return a match object if *s* looks like an event identifier."""
        return event_name_re.match(s.strip())

    result = {}
    # The context manager closes the document even when we bail out early.
    with pymupdf.open(pdf_path) as doc:
        for page in tqdm.tqdm(doc.pages(), total=doc.page_count):
            text = page.get_text()
            # Cheap text pre-filter before the costlier table detection.
            if "Event Name" not in text or "Brief Description" not in text:
                continue

            for tab in page.find_tables().tables or ():
                cur_table = tab.extract()
                try:
                    header = cur_table[0]
                    is_event_table = (
                        header[0].strip() == "Event Name"
                        and header[1].strip() == "Brief Description"
                    )
                except cell_errors:
                    print("Failed to extract table on page:", page.number, file=sys.stderr)
                    sys.exit(1)
                if not is_event_table:
                    continue

                # Here we start to extract event descriptions.
                for row in cur_table[1:]:
                    try:
                        event = row[0].strip()
                        desc = row[1].strip()
                    except cell_errors:
                        # Rows with a missing or non-text cell are skipped.
                        continue
                    # Only the first line of the name cell is the identifier.
                    event = event.split("\n")[0].strip()
                    if not is_event_name(event):
                        continue
                    result[event] = " ".join(x.strip() for x in desc.split("\n"))
    return result
if __name__ == "__main__":
    import sys

    # The first command-line argument is the PDF path; without it there is
    # nothing to do, so show the usage text and exit with status 1.
    if len(sys.argv) <= 1:
        print("Usage: ./apple_pdf_extract.py <pdf_path to Apple-Silicon-CPU-Optimization-Guide.pdf>")
        sys.exit(1)

    events = extract_event_desc(sys.argv[1])
    # Emit one "NAME: description" line per extracted event.
    for name in events:
        print(f"{name}: {events[name]}")