Skip to content

Commit b3cef7a

Browse files
authored
Merge pull request #327 from ClickHouse/log-clustering
Log clustering
2 parents b5e3207 + ff4ef09 commit b3cef7a

File tree

4 files changed

+387
-0
lines changed

4 files changed

+387
-0
lines changed
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python3
2+
import sys, json, argparse
3+
from collections import defaultdict
4+
from io import StringIO
5+
import contextlib
6+
from drain3 import TemplateMiner
7+
from drain3.template_miner_config import TemplateMinerConfig
8+
from drain3.file_persistence import FilePersistence
9+
10+
INI = """
11+
[SNAPSHOT]
12+
snapshot_interval_minutes = 10
13+
compress_state = True
14+
15+
[MASKING]
16+
masking = [
17+
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"},
18+
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"},
19+
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
20+
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
21+
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"},
22+
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"},
23+
{"regex_pattern":"(?i)([a-f0-9]{8}(?:-[a-f0-9]{4}){3}-[a-f0-9]{12})","mask_with":"UUID"},
24+
{"regex_pattern":"\\d{4}-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(?:\\.\\d+)?(?:Z|[+\\-]\\d{2}:\\d{2})?","mask_with":"TS"},
25+
{"regex_pattern":"\\\".*?\\\"","mask_with":"STR"},
26+
{"regex_pattern":"(?<=executed cmd )(\\".+?\\")", "mask_with": "CMD"}
27+
]
28+
mask_prefix = <:
29+
mask_suffix = :>
30+
31+
[DRAIN]
32+
sim_th = 0.4
33+
depth = 4
34+
max_children = 100
35+
max_clusters = 1024
36+
extra_delimiters = ["_"]
37+
38+
[PROFILING]
39+
enabled = False
40+
report_sec = 30
41+
"""
42+
def build_miner():
43+
cfg = TemplateMinerConfig()
44+
with contextlib.redirect_stdout(sys.stderr):
45+
cfg.load(StringIO(INI))
46+
cfg.config_file = None
47+
persistence = None
48+
return TemplateMiner(persistence, cfg)
49+
50+
def mine_summary(lines):
51+
miner = build_miner()
52+
counts = defaultdict(int)
53+
templates = {}
54+
total = 0
55+
56+
for raw in lines:
57+
if not raw:
58+
continue
59+
total += 1
60+
r = miner.add_log_message(raw)
61+
cid = r["cluster_id"]
62+
tmpl = r["template_mined"]
63+
counts[cid] += 1
64+
templates[cid] = tmpl
65+
66+
items = []
67+
for cid, cnt in counts.items():
68+
cov = (cnt / total * 100.0) if total else 0.0
69+
items.append({"template": templates[cid], "count": int(cnt), "coverage": round(cov, 2)})
70+
71+
items.sort(key=lambda x: (-x["count"], x["template"]))
72+
return items
73+
74+
def main():
75+
for line in sys.stdin:
76+
obj = json.loads(line)
77+
values = obj.get("values") or []
78+
strings = [s for s in values if isinstance(s, str)]
79+
result = mine_summary(strings)
80+
sys.stdout.write(json.dumps({"result": result}, ensure_ascii=False) + "\n")
81+
sys.stdout.flush()
82+
83+
if __name__ == "__main__":
84+
main()
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<functions>
2+
<function>
3+
<type>executable_pool</type>
4+
<name>drain3_miner</name>
5+
<return_type>Array(String)</return_type>
6+
<return_name>result</return_name>
7+
<argument>
8+
<type>Array(String)</type>
9+
<name>values</name>
10+
</argument>
11+
<format>JSONEachRow</format>
12+
<command>drain3_miner.py</command>
13+
<execute_direct>1</execute_direct>
14+
<pool_size>1</pool_size>
15+
<max_command_execution_time>100</max_command_execution_time>
16+
<command_read_timeout>100000</command_read_timeout>
17+
<send_chunk_header>false</send_chunk_header>
18+
</function>
19+
</functions>
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
CREATE MATERIALIZED VIEW IF NOT EXISTS mv_logs_structured
2+
TO logs_structured
3+
AS
4+
SELECT
5+
ServiceName,
6+
/* which template matched */
7+
multiIf(m1, 1, m2, 2, m3, 3, m4, 4, m5, 5, m6, 6, 7) AS TemplateNumber,
8+
/* extracted fields as Map(LowCardinality(String), String) */
9+
CAST(
10+
multiIf(
11+
m1,
12+
map(
13+
'date', g1_1,
14+
'time', g1_2,
15+
'service_name', g1_3,
16+
'trace_sampled', g1_4,
17+
'prod_1', g1_5,
18+
'prod_2', g1_6,
19+
'prod_3', g1_7,
20+
'prod_4', g1_8,
21+
'prod_5', g1_9
22+
),
23+
m2,
24+
map(
25+
'prod_1', g2_1,
26+
'prod_2', g2_2,
27+
'prod_3', g2_3,
28+
'prod_4', g2_4,
29+
'prod_5', g2_5
30+
),
31+
m3,
32+
map('cart_1', g3_1),
33+
m4,
34+
map(), -- pattern4 has no captures; nothing to extract
35+
m5,
36+
map(
37+
'cart_1', g5_1,
38+
'cart_2', g5_2,
39+
'cart_3', g5_3
40+
),
41+
m6,
42+
map(
43+
'remote_addr', g6_1,
44+
'remote_user', g6_2,
45+
'time_local', g6_3,
46+
'request_type', g6_4,
47+
'request_path', g6_5,
48+
'request_protocol', g6_6,
49+
'status', g6_7,
50+
'size', g6_8,
51+
'referer', g6_9,
52+
'user_agent', g6_10
53+
),
54+
map() -- else: empty map
55+
),
56+
'Map(LowCardinality(String), String)'
57+
) AS Extracted
58+
FROM
59+
(
60+
/* compute once per row */
61+
WITH
62+
'^([^\\s]+) ([^\\s]+) INFO \[main\] \[recommendation_server.py:47\] \[trace_id=([^\\s]+) span_id=([^\\s]+) resource\.service\.name=recommendation trace_sampled=True\] - Receive ListRecommendations for product ids:\[([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+)\]$' AS pattern1,
63+
'^Receive ListRecommendations for product ([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+)$' AS pattern2,
64+
'^[\\s]*GetCartAsync called with userId=([^\\s]+)$' AS pattern3,
65+
'^info\: cart.cartstore.ValkeyCartStore\[0\]$' AS pattern4,
66+
'^[\\s]*AddItemAsync called with userId=([^\\s]+), productId=([^\\s]+), quantity=([^\\s]+)$' AS pattern5,
67+
'^(\S+) - (\S+) \[([^\]]+)\] "([A-Z]+)?\s*(.*?)\s*(HTTP\S+)?" (\d{3}) (\d+) "([^"]*)" "([^"]*)"$' AS pattern6
68+
69+
SELECT
70+
*,
71+
match(Body, pattern1) AS m1,
72+
match(Body, pattern2) AS m2,
73+
match(Body, pattern3) AS m3,
74+
match(Body, pattern4) AS m4,
75+
match(Body, pattern5) AS m5,
76+
match(Body, pattern6) AS m6,
77+
extractAllGroups(Body, pattern1) AS g1,
78+
extractAllGroups(Body, pattern2) AS g2,
79+
extractAllGroups(Body, pattern3) AS g3,
80+
extractAllGroups(Body, pattern4) AS g4,
81+
extractAllGroups(Body, pattern5) AS g5,
82+
extractAllGroups(Body, pattern6) AS g6,
83+
84+
/* pick first (and only) match’s capture groups */
85+
arrayElement(arrayElement(g1, 1), 1) AS g1_1,
86+
arrayElement(arrayElement(g1, 1), 2) AS g1_2,
87+
arrayElement(arrayElement(g1, 1), 3) AS g1_3,
88+
arrayElement(arrayElement(g1, 1), 4) AS g1_4,
89+
arrayElement(arrayElement(g1, 1), 5) AS g1_5,
90+
arrayElement(arrayElement(g1, 1), 6) AS g1_6,
91+
arrayElement(arrayElement(g1, 1), 7) AS g1_7,
92+
arrayElement(arrayElement(g1, 1), 7) AS g1_8,
93+
arrayElement(arrayElement(g1, 1), 7) AS g1_9,
94+
95+
arrayElement(arrayElement(g2, 1), 1) AS g2_1,
96+
arrayElement(arrayElement(g2, 1), 2) AS g2_2,
97+
arrayElement(arrayElement(g2, 1), 3) AS g2_3,
98+
arrayElement(arrayElement(g2, 1), 4) AS g2_4,
99+
arrayElement(arrayElement(g2, 1), 5) AS g2_5,
100+
101+
arrayElement(arrayElement(g3, 1), 1) AS g3_1,
102+
arrayElement(arrayElement(g5, 1), 1) AS g5_1,
103+
arrayElement(arrayElement(g5, 1), 2) AS g5_2,
104+
arrayElement(arrayElement(g5, 1), 3) AS g5_3,
105+
106+
arrayElement(arrayElement(g6, 1), 1) AS g6_1,
107+
arrayElement(arrayElement(g6, 1), 2) AS g6_2,
108+
arrayElement(arrayElement(g6, 1), 3) AS g6_3,
109+
arrayElement(arrayElement(g6, 1), 4) AS g6_4,
110+
arrayElement(arrayElement(g6, 1), 1) AS g6_5,
111+
arrayElement(arrayElement(g6, 1), 2) AS g6_6,
112+
arrayElement(arrayElement(g6, 1), 3) AS g6_7,
113+
arrayElement(arrayElement(g6, 1), 4) AS g6_8,
114+
arrayElement(arrayElement(g6, 1), 4) AS g6_9,
115+
arrayElement(arrayElement(g6, 1), 4) AS g6_10
116+
FROM logs_raw
117+
);
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
-- Create table for nginx logs
2+
CREATE TABLE logs_nginx
3+
(
4+
`remote_addr` IPv4,
5+
`remote_user` LowCardinality(String),
6+
`time_local` DateTime CODEC(Delta(4), ZSTD(1)),
7+
`request_type` LowCardinality(String),
8+
`request_path` String CODEC(ZSTD(6)),
9+
`request_protocol` LowCardinality(String),
10+
`status` UInt16,
11+
`size` UInt32,
12+
`referer` String CODEC(ZSTD(6)),
13+
`user_agent` LowCardinality(String),
14+
Body ALIAS format('{0} - {1} [{2}] "{3} {4} {5}" {6} {7} "{8}" "{9}"',remote_addr, remote_user, time_local, request_type, request_path, request_protocol, status, size, referer, user_agent)
15+
)
16+
ORDER BY (user_agent, remote_user, referer, request_path)
17+
18+
19+
-- Create table for recommendation service logs
20+
CREATE TABLE logs_service_recommendation
21+
(
22+
TemplateNumber UInt8,
23+
`date` String,
24+
`time` String,
25+
`service_name` Nullable(UUID),
26+
`trace_sampled` Nullable(UUID),
27+
`prod_1` LowCardinality(String),
28+
`prod_2` LowCardinality(String),
29+
`prod_3` LowCardinality(String),
30+
`prod_4` LowCardinality(String),
31+
`prod_5` LowCardinality(String),
32+
Body ALIAS multiIf(TemplateNumber=1, format('{0} {1} INFO [main] [recommendation_server.py:47] resource.service.name={2} trace_sampled={3}] - Receive ListRecommendations for product {4} {5} {6} {7} {8}',date,time,service_name,trace_sampled,prod_1,prod_2,prod_3,prod_4,prod_5), TemplateNumber=2, format('Receive ListRecommendations for product {0} {1} {2} {3} {4}',prod_1,prod_2,prod_3,prod_4,prod_5),''))
33+
ORDER BY (date, prod_1, prod_2, prod_3, prod_4, prod_5)
34+
35+
-- Create table for cart service logs
36+
CREATE TABLE logs_service_cart
37+
(
38+
TemplateNumber UInt8,
39+
`user_id` Nullable(UUID),
40+
`product_id` String,
41+
`quantity` String,
42+
Body ALIAS multiIf(
43+
TemplateNumber=1, format('GetCartAsync called with userId={0}',user_id),
44+
TemplateNumber=2, 'info: cart.cartstore.ValkeyCartStore[0]',
45+
TemplateNumber=3, format('AddItemAsync called with userId={0}, productId={1}, quantity={2}', user_id, product_id, quantity),
46+
TemplateNumber=4, format('EmptyCartAsync called with userId={0}',user_id),
47+
'')
48+
)
49+
ORDER BY (TemplateNumber, product_id, quantity)
50+
51+
-- Create materialized view for nginx logs
52+
CREATE MATERIALIZED VIEW IF NOT EXISTS mv_logs_nginx
53+
TO logs_service_nginx
54+
AS
55+
SELECT
56+
remote_address,
57+
remote_user,
58+
time_local,
59+
request_type,
60+
request_path,
61+
request_protocol,
62+
status,
63+
size,
64+
referer,
65+
user_agent
66+
FROM
67+
(
68+
WITH
69+
'^(\S+) - (\S+) \[([^\]]+)\] "([A-Z]+)?\s*(.*?)\s*(HTTP\S+)?" (\d{3}) (\d+) "([^"]*)" "([^"]*)"$' AS pattern
70+
SELECT
71+
*,
72+
match(Body, pattern) AS m1,
73+
extractAllGroups(Body, pattern) AS g,
74+
arrayElement(arrayElement(g, 1), 1) AS remote_address,
75+
arrayElement(arrayElement(g, 1), 2) AS remote_user,
76+
parseDateTimeBestEffort(arrayElement(arrayElement(g, 1), 3)) AS time_local,
77+
arrayElement(arrayElement(g, 1), 4) AS request_type,
78+
arrayElement(arrayElement(g, 1), 5) AS request_path,
79+
arrayElement(arrayElement(g, 1), 6) AS request_protocol,
80+
arrayElement(arrayElement(g, 1), 7) AS status,
81+
arrayElement(arrayElement(g, 1), 8) AS size,
82+
arrayElement(arrayElement(g, 1), 9) AS referer,
83+
arrayElement(arrayElement(g, 1), 10) AS user_agent
84+
FROM raw_logs where ServiceName='nginx'
85+
) WHERE m1;
86+
87+
-- Create materialized view for recommendation service logs
88+
CREATE MATERIALIZED VIEW IF NOT EXISTS mv_logs_recommendation
89+
TO logs_service_recommendation
90+
AS
91+
SELECT
92+
CASE WHEN m1 THEN 1 WHEN m2 THEN 2 ELSE 0 END AS TemplateNumber,
93+
CASE when m1 THEN g1_1 ELSE NULL END AS date,
94+
CASE when m1 THEN g1_2 ELSE NULL END AS time,
95+
CASE when m1 THEN g1_3 ELSE NULL END AS service_name,
96+
CASE when m1 THEN g1_4 ELSE NULL END AS trace_sampled,
97+
CASE when m1 THEN g1_5 ELSE g2_1 END AS prod_1,
98+
CASE when m1 THEN g1_6 ELSE g2_2 END AS prod_2,
99+
CASE when m1 THEN g1_7 ELSE g2_3 END AS prod_3,
100+
CASE when m1 THEN g1_8 ELSE g2_4 END AS prod_4,
101+
CASE when m1 THEN g1_9 ELSE g2_5 END AS prod_5
102+
103+
FROM
104+
(
105+
WITH
106+
'^([^\\s]+) ([^\\s]+) INFO \[main\] \[recommendation_server.py:47\] \[trace_id=([^\\s]+) span_id=([^\\s]+) resource\.service\.name=recommendation trace_sampled=True\] - Receive ListRecommendations for product ids:\[([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+)\]$' AS pattern1,
107+
'^Receive ListRecommendations for product ([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+) ([^\\s]+)$' AS pattern2
108+
SELECT
109+
*,
110+
match(Body, pattern1) AS m1,
111+
match(Body, pattern2) AS m2,
112+
extractAllGroups(Body, pattern1) AS g1,
113+
extractAllGroups(Body, pattern2) AS g2,
114+
115+
arrayElement(arrayElement(g1, 1), 1) AS g1_1,
116+
arrayElement(arrayElement(g1, 1), 2) AS g1_2,
117+
arrayElement(arrayElement(g1, 1), 3) AS g1_3,
118+
arrayElement(arrayElement(g1, 1), 4) AS g1_4,
119+
arrayElement(arrayElement(g1, 1), 5) AS g1_5,
120+
arrayElement(arrayElement(g1, 1), 6) AS g1_6,
121+
arrayElement(arrayElement(g1, 1), 7) AS g1_7,
122+
arrayElement(arrayElement(g1, 1), 7) AS g1_8,
123+
arrayElement(arrayElement(g1, 1), 7) AS g1_9,
124+
125+
arrayElement(arrayElement(g2, 1), 1) AS g2_1,
126+
arrayElement(arrayElement(g2, 1), 2) AS g2_2,
127+
arrayElement(arrayElement(g2, 1), 3) AS g2_3,
128+
arrayElement(arrayElement(g2, 1), 4) AS g2_4,
129+
arrayElement(arrayElement(g2, 1), 5) AS g2_5
130+
FROM raw_logs where ServiceName='recommendation'
131+
);
132+
133+
-- Create materialized view for cart service logs
134+
CREATE MATERIALIZED VIEW IF NOT EXISTS mv_logs_cart
135+
TO logs_service_cart
136+
AS
137+
SELECT
138+
multiIf(m1, 1, m2, 2, m3, 3, 0) AS TemplateNumber,
139+
multiIf(m1, g1_1, m2, Null, m3, g3_1, m4, g4_1, Null) AS user_id,
140+
multiIf(m1, '', m2, '', m3, g3_2, '') AS product_id,
141+
multiIf(m1, '', m2, '', m3, g3_3, '') AS quantity
142+
143+
FROM
144+
(
145+
WITH
146+
'^[\\s]*GetCartAsync called with userId=([^\\s]*)$' AS pattern1,
147+
'^info\: cart.cartstore.ValkeyCartStore\[0\]$' AS pattern2,
148+
'^[\\s]*AddItemAsync called with userId=([^\\s]+), productId=([^\\s]+), quantity=([^\\s]+)$' AS pattern3,
149+
'^[\\s]*EmptyCartAsync called with userId=([^\\s]*)$' AS pattern4
150+
SELECT
151+
*,
152+
match(Body, pattern1) AS m1,
153+
match(Body, pattern2) AS m2,
154+
match(Body, pattern3) AS m3,
155+
match(Body, pattern4) AS m4,
156+
extractAllGroups(Body, pattern1) AS g1,
157+
extractAllGroups(Body, pattern2) AS g2,
158+
extractAllGroups(Body, pattern3) AS g3,
159+
extractAllGroups(Body, pattern4) AS g4,
160+
161+
arrayElement(arrayElement(g1, 1), 1) AS g1_1,
162+
arrayElement(arrayElement(g3, 1), 1) AS g3_1,
163+
arrayElement(arrayElement(g3, 1), 2) AS g3_2,
164+
arrayElement(arrayElement(g3, 1), 3) AS g3_3,
165+
arrayElement(arrayElement(g4, 1), 1) AS g4_1
166+
FROM raw_logs where ServiceName='cart'
167+
);

0 commit comments

Comments
 (0)