Skip to content

Commit a5b8e84

Browse files
authored
data-mining-logs: extract documents and messages (#5)
* Add Python log_parser.py: extract the meaningful events needed to monitor messages and store them as `{ '<log timestamp>': {json event update} }`
* Remove the "parsed" prefix from the generated file
* data-mining-logs: log parsing + extraction of documents and messages
1 parent 7015110 commit a5b8e84

File tree

5 files changed

+160
-0
lines changed

5 files changed

+160
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,5 @@
1818

1919
*.log
2020
/kills.txt
21+
*.mypy_cache
22+
*.coverage

data-mining-logs/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
docs/
2+
token.txt
3+
logs/
4+
.venv
5+
.mypy_cache

data-mining-logs/extract.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Tooling: telepot drives the Telegram Bot API; tqdm renders progress bars.
import json
import os
import telepot  # notype
from tqdm import tqdm  # notype

# --- Script configuration, executed at import time ---

# Bot token is read from disk so it never lands in version control
# (token.txt is listed in this directory's .gitignore).
token = open("token.txt").read().strip()
bot = telepot.Bot(token)
# Only updates whose chat title matches this string are extracted.
chat_title = "Common Lisp Brasil"
# Input: output of parser.py — {"<log timestamp>": <getUpdates response>}.
logs_fpath = "logs/putaria.log.json"
# Output root for this chat, e.g. "docs/common_lisp_brasil".
dir_name = os.path.join("docs", chat_title.replace(" ", "_").lower())
11+
12+
13+
def get_title(result):
    """Return the chat title of a getUpdates result entry, or None.

    Digs into result["message"]["chat"]["title"]; any missing level
    yields None instead of raising KeyError.
    """
    message = result.get("message", {})
    chat = message.get("chat", {})
    return chat.get("title")
16+
17+
def collect_documents(chat_title=chat_title):
    """Collect every downloadable attachment posted in *chat_title*.

    Scans the parsed log (module-level ``logs_fpath``) and returns a list
    of Telegram file descriptors (dicts carrying ``file_id``,
    ``file_unique_id`` and, when known, ``mime_type``).
    """
    # Message keys that can carry a downloadable payload.
    doc_types = [
        "document",
        "video",
        "voice",
        "photo",
    ]
    # Telegram omits mime_type on photo size entries; this fills it in so
    # download_document() can derive a folder and extension.
    mime_type = {
        "video": "video/mp4",
        "voice": "audio/ogg",
        "photo": "image/jpg",
    }
    # bug fix: json.load(open(...)) leaked the file handle.
    with open(logs_fpath) as f:
        logs = json.load(f)
    docs = []
    for timestamp, event in logs.items():
        for result in event["result"]:
            if get_title(result) != chat_title:
                continue
            for doc_type in doc_types:
                doc = result["message"].get(doc_type)
                if isinstance(doc, list):
                    doc = doc[-1]  # multiple thumbs, get the best quality
                    doc["mime_type"] = mime_type[doc_type]
                if doc:
                    docs.append(doc)
    return docs
43+
44+
45+
def collect_messages(chat_title=chat_title):
    """Collect every raw message posted in *chat_title*.

    Returns a list of {"date": <log timestamp>, "message": <raw Telegram
    message dict>} read from the module-level ``logs_fpath``.
    """
    # bug fix: json.load(open(...)) leaked the file handle.
    with open(logs_fpath) as f:
        logs = json.load(f)
    messages = []
    for timestamp, event in logs.items():
        for result in event["result"]:
            if get_title(result) != chat_title:
                continue
            message = result["message"]
            if message:
                messages.append({
                    "date": timestamp,
                    "message": message,
                })
    return messages
58+
59+
60+
def dump_messages(messages, out_dir=None):
    """Write a chronological plain-text transcript to <out_dir>/messages.txt.

    messages -- list of {"date": ..., "message": ...} as produced by
        collect_messages(); entries without a "text" field are skipped.
    out_dir  -- destination directory; defaults to the module-level dir_name
        (new optional parameter, backward compatible).
    """
    if out_dir is None:
        out_dir = dir_name
    with open(os.path.join(out_dir, "messages.txt"), "w") as f:
        # Log timestamps are zero-padded, so a lexical sort is chronological.
        for message in sorted(messages, key=lambda x: x["date"]):
            date = message["date"]
            # NOTE(review): assumes every message has from.first_name;
            # service messages / channel posts may lack "from" — confirm.
            username = message["message"]["from"]["first_name"]
            text = message["message"].get("text")
            if text:
                # Newlines are flattened so each message stays on one line.
                template = f"{date} / {username}: {text}".replace("\n", " ")
                f.write(template + "\n")
70+
71+
72+
def download_document(doc, dir_name=dir_name):
    """Download one Telegram file descriptor into dir_name/<kind>/.

    The file is named <file_unique_id> + an extension derived from the
    mime type (e.g. "video.mp4") or the original file name. Existing
    files are skipped so the script can be re-run incrementally. Errors
    are printed and swallowed so one bad file does not abort the batch.
    """
    # bug fix: fname was only assigned inside the try block, so the
    # except handlers could raise NameError when an early line failed.
    fname = doc.get("file_unique_id", "<unknown>")
    try:
        mime_type = doc.get("mime_type")
        extension = ".raw"
        if mime_type:
            extension = mime_type.replace("/", ".")
        elif doc.get("file_name"):
            extension = doc["file_name"]
        fname = doc["file_unique_id"] + extension
        # bug fix: mime_type may be None (file_name-only documents);
        # previously .split() raised AttributeError and those files were
        # never downloaded. They now land in an "other" folder.
        folder = mime_type.split("/")[0] if mime_type else "other"
        dir_path = os.path.join(dir_name, folder)
        os.makedirs(dir_path, exist_ok=True)
        fpath = os.path.join(dir_path, fname)
        if not os.path.exists(fpath):
            bot.download_file(doc["file_id"], fpath)
    except telepot.exception.TelegramError as e:
        print(f"Telegram exception for {fname}: {e}")
    except Exception as e:
        print(f"Python exception, I screw up: {e}")
91+
92+
93+
def download_documents(docs):
    """Fetch every collected file descriptor, showing a tqdm progress bar."""
    for document in tqdm(docs):
        download_document(document, dir_name)
96+
97+
98+
if __name__ == "__main__":
    # One-shot export: mirror all attachments, then dump a text transcript.
    os.makedirs(dir_name, exist_ok=True)
    print(f"-- Collecting documents in: {dir_name}")
    docs = collect_documents()
    download_documents(docs)
    print(f"-- Collecting messages in: {dir_name}")
    messages = collect_messages()
    dump_messages(messages)

data-mining-logs/parser.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/usr/bin/env python3
"""Parse a telepot debug log into {"<timestamp>": <getUpdates JSON response>}."""
import re
import enum
import sys
import json

# Any line starting with a YYYY/MM/DD date terminates the multi-line JSON
# body of the current entry.  (The trailing ".*" is redundant but harmless.)
log_end_regex = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).*")
# Start of an entry: "<timestamp> ... getUpdates resp: {json...".
# group(1) = the dated prefix used as the dict key,
# group(2) = the first line of the JSON payload.
log_start_regex = re.compile(r"^(\d{4}/\d{2}/\d{2}.*).getUpdates.resp: ({.*)")
9+
10+
# Two-state parser flag: `stop` between entries, `start` while a multi-line
# JSON payload is being accumulated.  Functional enum form, same members
# and values (stop=0, start=1) as the original class statement.
ParsingState = enum.Enum("ParsingState", [("stop", 0), ("start", 1)])
13+
14+
15+
def parse(fpath: str) -> dict:
    """Parse a telepot debug log into {timestamp: decoded getUpdates response}.

    Entries whose JSON cannot be decoded, or whose "result" list is
    missing or empty, are dropped from the returned mapping.
    """
    logs = {}
    state_machine = ParsingState.stop
    with open(fpath) as f:
        key = ""
        # Stream the file instead of materializing it with readlines().
        for line in f:
            match = log_start_regex.match(line)  # was: match() then search() — one call suffices
            if match:
                key = match.group(1)
                logs[key] = match.group(2)
                state_machine = ParsingState.start
            elif log_end_regex.match(line):
                # A new dated line that is not a getUpdates response ends
                # the current JSON payload.
                state_machine = ParsingState.stop
            elif state_machine == ParsingState.start:
                logs[key] += line

    # Decode each accumulated payload.  sorted() materializes the items,
    # so deleting from logs while iterating is safe.
    for k, v in sorted(logs.items(), key=lambda x: x[1]):
        try:
            j = json.loads(v)
            if not j or not j["result"]:
                # bug fix: the original deleted the empty entry and then
                # unconditionally re-added it via `logs[k] = j`, making
                # the filter a no-op.
                del logs[k]
            else:
                logs[k] = j
        except (json.JSONDecodeError, KeyError, TypeError):
            # Truncated/garbled payload, or a response without "result".
            del logs[k]
    return logs
41+
42+
43+
if __name__ == '__main__':
    # Usage: parser.py <logfile>; writes <logfile>.json next to the input.
    fname = sys.argv[1]
    logs = parse(fname)
    # NOTE(review): output handle is left for the interpreter to close —
    # acceptable in a one-shot script.
    json.dump(logs, open(fname + ".json", "w"))

data-mining-logs/requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
telepot
2+
tqdm

0 commit comments

Comments
 (0)