This repository was archived by the owner on Sep 24, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathextract-html-metadatas.py
More file actions
36 lines (36 loc) · 1.44 KB
/
extract-html-metadatas.py
File metadata and controls
36 lines (36 loc) · 1.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
r"""
PIPER script to extract interesting information from HTML META tags of an HTTP response.

Target tool: Message viewers
Requires that HTTP headers are NOT passed to the script (only the body, on stdin).
Filter needed: the Content-Type header must match the case-insensitive
regex [a-z]+\/x?(html)
"""
# NOTE: the docstring is a raw string on purpose: "\/" is not a valid string
# escape and triggers a SyntaxWarning on recent Python versions otherwise.
import re
import sys

# Building blocks shared by both attribute orders. Named groups let the same
# extraction code handle NAME-then-CONTENT and CONTENT-then-NAME tags.
_QUOTE = r'[\'"]'
_NAME = r'(?P<name>framework|generator)'
_VALUE = r'(?P<content>[\d\w\.\s\-,\/:\(\)]+)'

# Match all meta tags holding "generator" and "framework" information,
# whatever the order of the attributes NAME and CONTENT.
# "\s+" (instead of a single "\s") tolerates several spaces or a line break
# between "<meta" and the attributes, which is common in indented HTML.
META_PATTERNS = (
    # Order 1) NAME 2) CONTENT
    re.compile(
        rf'<meta\s+name={_QUOTE}{_NAME}{_QUOTE}\s+content={_QUOTE}{_VALUE}{_QUOTE}\s*\/?>',
        re.IGNORECASE,
    ),
    # Order 1) CONTENT 2) NAME
    re.compile(
        rf'<meta\s+content={_QUOTE}{_VALUE}{_QUOTE}\s+name={_QUOTE}{_NAME}{_QUOTE}\s*\/?>',
        re.IGNORECASE,
    ),
)


def extract_metadatas(content: str) -> list:
    """Return the sorted, de-duplicated "Name: value" lines found in *content*.

    *content* is the HTML body to scan. Each matching META tag yields one
    entry made of the capitalized tag name ("Framework" or "Generator"),
    a colon, and the raw CONTENT attribute value.
    """
    found = set()  # a set handles duplicates across both patterns
    for pattern in META_PATTERNS:
        for match in pattern.finditer(content):
            found.add(f"{match.group('name').capitalize()}: {match.group('content')}")
    return sorted(found)


def format_report(results: list) -> str:
    """Return the text shown to the user for the extracted *results*."""
    if not results:
        return "No metadata found."
    return "\n".join([f"{len(results)} metadata(s) found:", *results])


def main() -> None:
    """Read the whole response body from stdin and print the report."""
    content = sys.stdin.read()
    print(format_report(extract_metadatas(content)))


# Guarded so the module can be imported without consuming stdin.
if __name__ == "__main__":
    main()