# =========================================================================
# Copyright (C) 2016-2023 LOGPAI (https://github.com/logpai).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================
| 17 | + |
| 18 | +import sys |
| 19 | +sys.path.append("../../") |
| 20 | +from logparser.Logram import LogParser |
| 21 | +from logparser.utils import evaluator |
| 22 | +import os |
| 23 | +import pandas as pd |
| 24 | + |
| 25 | + |
| 26 | +input_dir = "../../data/loghub_2k/" # The input directory of log file |
| 27 | +output_dir = "Logram_result/" # The output directory of parsing results |
| 28 | + |
| 29 | +benchmark_settings = { |
| 30 | + "HDFS": { |
| 31 | + "log_file": "HDFS/HDFS_2k.log", |
| 32 | + "log_format": "<Date> <Time> <Pid> <Level> <Component>: <Content>", |
| 33 | + "regex": [ |
| 34 | + r"blk_(|-)[0-9]+", # block id |
| 35 | + r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)", # IP |
| 36 | + r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$", |
| 37 | + ], |
| 38 | + "doubleThreshold": 15, |
| 39 | + "triThreshold": 10, |
| 40 | + }, |
| 41 | + "Hadoop": { |
| 42 | + "log_file": "Hadoop/Hadoop_2k.log", |
| 43 | + "log_format": "<Date> <Time> <Level> \[<Process>\] <Component>: <Content>", |
| 44 | + "regex": [r"(\d+\.){3}\d+"], |
| 45 | + "doubleThreshold": 9, |
| 46 | + "triThreshold": 10, |
| 47 | + }, |
| 48 | + "Spark": { |
| 49 | + "log_file": "Spark/Spark_2k.log", |
| 50 | + "log_format": "<Date> <Time> <Level> <Component>: <Content>", |
| 51 | + "regex": [r"(\d+\.){3}\d+", r"\b[KGTM]?B\b", r"([\w-]+\.){2,}[\w-]+"], |
| 52 | + "doubleThreshold": 15, |
| 53 | + "triThreshold": 10, |
| 54 | + }, |
| 55 | + "Zookeeper": { |
| 56 | + "log_file": "Zookeeper/Zookeeper_2k.log", |
| 57 | + "log_format": "<Date> <Time> - <Level> \[<Node>:<Component>@<Id>\] - <Content>", |
| 58 | + "regex": [r"(/|)(\d+\.){3}\d+(:\d+)?"], |
| 59 | + "doubleThreshold": 15, |
| 60 | + "triThreshold": 10, |
| 61 | + }, |
| 62 | + "BGL": { |
| 63 | + "log_file": "BGL/BGL_2k.log", |
| 64 | + "log_format": "<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>", |
| 65 | + "regex": [r"core\.\d+"], |
| 66 | + "doubleThreshold": 92, |
| 67 | + "triThreshold": 4, |
| 68 | + }, |
| 69 | + "HPC": { |
| 70 | + "log_file": "HPC/HPC_2k.log", |
| 71 | + "log_format": "<LogId> <Node> <Component> <State> <Time> <Flag> <Content>", |
| 72 | + "regex": [r"=\d+"], |
| 73 | + "doubleThreshold": 15, |
| 74 | + "triThreshold": 10, |
| 75 | + }, |
| 76 | + "Thunderbird": { |
| 77 | + "log_file": "Thunderbird/Thunderbird_2k.log", |
| 78 | + "log_format": "<Label> <Timestamp> <Date> <User> <Month> <Day> <Time> <Location> <Component>(\[<PID>\])?: <Content>", |
| 79 | + "regex": [r"(\d+\.){3}\d+"], |
| 80 | + "doubleThreshold": 35, |
| 81 | + "triThreshold": 32, |
| 82 | + }, |
| 83 | + "Windows": { |
| 84 | + "log_file": "Windows/Windows_2k.log", |
| 85 | + "log_format": "<Date> <Time>, <Level> <Component> <Content>", |
| 86 | + "regex": [r"0x.*?\s"], |
| 87 | + "doubleThreshold": 15, |
| 88 | + "triThreshold": 10, |
| 89 | + }, |
| 90 | + "Linux": { |
| 91 | + "log_file": "Linux/Linux_2k.log", |
| 92 | + "log_format": "<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>", |
| 93 | + "regex": [r"(\d+\.){3}\d+", r"\d{2}:\d{2}:\d{2}"], |
| 94 | + "doubleThreshold": 120, |
| 95 | + "triThreshold": 100, |
| 96 | + }, |
| 97 | + "Android": { |
| 98 | + "log_file": "Android/Android_2k.log", |
| 99 | + "log_format": "<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>", |
| 100 | + "regex": [ |
| 101 | + r"(/[\w-]+)+", |
| 102 | + r"([\w-]+\.){2,}[\w-]+", |
| 103 | + r"\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b", |
| 104 | + ], |
| 105 | + "doubleThreshold": 15, |
| 106 | + "triThreshold": 10, |
| 107 | + }, |
| 108 | + "HealthApp": { |
| 109 | + "log_file": "HealthApp/HealthApp_2k.log", |
| 110 | + "log_format": "<Time>\|<Component>\|<Pid>\|<Content>", |
| 111 | + "regex": [], |
| 112 | + "doubleThreshold": 15, |
| 113 | + "triThreshold": 10, |
| 114 | + }, |
| 115 | + "Apache": { |
| 116 | + "log_file": "Apache/Apache_2k.log", |
| 117 | + "log_format": "\[<Time>\] \[<Level>\] <Content>", |
| 118 | + "regex": [r"(\d+\.){3}\d+"], |
| 119 | + "doubleThreshold": 15, |
| 120 | + "triThreshold": 10, |
| 121 | + }, |
| 122 | + "Proxifier": { |
| 123 | + "log_file": "Proxifier/Proxifier_2k.log", |
| 124 | + "log_format": "\[<Time>\] <Program> - <Content>", |
| 125 | + "regex": [ |
| 126 | + r"<\d+\ssec", |
| 127 | + r"([\w-]+\.)+[\w-]+(:\d+)?", |
| 128 | + r"\d{2}:\d{2}(:\d{2})*", |
| 129 | + r"[KGTM]B", |
| 130 | + ], |
| 131 | + "doubleThreshold": 500, |
| 132 | + "triThreshold": 470, |
| 133 | + }, |
| 134 | + "OpenSSH": { |
| 135 | + "log_file": "OpenSSH/OpenSSH_2k.log", |
| 136 | + "log_format": "<Date> <Day> <Time> <Component> sshd\[<Pid>\]: <Content>", |
| 137 | + "regex": [r"(\d+\.){3}\d+", r"([\w-]+\.){2,}[\w-]+"], |
| 138 | + "doubleThreshold": 88, |
| 139 | + "triThreshold": 81, |
| 140 | + }, |
| 141 | + "OpenStack": { |
| 142 | + "log_file": "OpenStack/OpenStack_2k.log", |
| 143 | + "log_format": "<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>", |
| 144 | + "regex": [r"((\d+\.){3}\d+,?)+", r"/.+?\s", r"\d+"], |
| 145 | + "doubleThreshold": 30, |
| 146 | + "triThreshold": 25, |
| 147 | + }, |
| 148 | + "Mac": { |
| 149 | + "log_file": "Mac/Mac_2k.log", |
| 150 | + "log_format": "<Month> <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>", |
| 151 | + "regex": [r"([\w-]+\.){2,}[\w-]+"], |
| 152 | + "doubleThreshold": 2, |
| 153 | + "triThreshold": 2, |
| 154 | + }, |
| 155 | +} |
| 156 | + |
| 157 | +bechmark_result = [] |
| 158 | +for dataset, setting in benchmark_settings.items(): |
| 159 | + print("\n=== Evaluation on %s ===" % dataset) |
| 160 | + indir = os.path.join(input_dir, os.path.dirname(setting["log_file"])) |
| 161 | + log_file = os.path.basename(setting["log_file"]) |
| 162 | + |
| 163 | + parser = LogParser( |
| 164 | + log_format=setting["log_format"], |
| 165 | + indir=indir, |
| 166 | + outdir=output_dir, |
| 167 | + rex=setting["regex"], |
| 168 | + doubleThreshold=setting["doubleThreshold"], |
| 169 | + triThreshold=setting["triThreshold"], |
| 170 | + ) |
| 171 | + parser.parse(log_file) |
| 172 | + |
| 173 | + F1_measure, accuracy = evaluator.evaluate( |
| 174 | + groundtruth=os.path.join(indir, log_file + "_structured.csv"), |
| 175 | + parsedresult=os.path.join(output_dir, log_file + "_structured.csv"), |
| 176 | + ) |
| 177 | + bechmark_result.append([dataset, F1_measure, accuracy]) |
| 178 | + |
| 179 | +print("\n=== Overall evaluation results ===") |
| 180 | +df_result = pd.DataFrame(bechmark_result, columns=["Dataset", "F1_measure", "Accuracy"]) |
| 181 | +df_result.set_index("Dataset", inplace=True) |
| 182 | +print(df_result) |
| 183 | +df_result.to_csv("Logram_bechmark_result.csv", float_format="%.6f") |