gaston/testcheck.py at master · tfiedor/gaston · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
#!/usr/bin/env python2
"""
    WSkS Test Bench

    @author: Tomas Fiedor, ifiedortom@fit.vutbr.cz
    @summary: Test Bench script for running several benchmarks on binaries

"""

import argparse
from datetime import datetime
import itertools
import os
import re
import subprocess
import sys
from threading import Timer
from termcolor import colored

dwina_error = -1
timeout_error = -2
mona_error = -1
mona_expnf_error = -2
time_error = 0.1        # error in percents for timing
reference_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tests", "perf", "reference.perf")


def createArgumentParser():
    """
    Creates Argument Parser object
    """
    parser = argparse.ArgumentParser("WSkS Test Check")
    parser.add_argument('--dir', '-d', default="basic", help="directory with benchmarks")
    parser.add_argument('--timeout', '-t', default=None, help='timeouts in minutes')
    parser.add_argument('--check', '-c', action='store_true', help='run the regression testing')
    parser.add_argument('--check-mem', '-cm', action='store_true', help='runs the valgrind during testing as well')
    parser.add_argument('--check-perf', action='store_true', help='run the performance testing')
    return parser


def run_mona(test, timeout, checkonly=False):
    """
    Runs MONA with following arguments:
    """
    args = ('mona', '-s', '"{}"'.format(test))
    output, retcode = runProcess(args, timeout)
    if(retcode != 0):
        return mona_error, output
    return parseMonaOutput(output, False, checkonly)


def run_gaston(test, timeout, checkonly=False):
    """
    Runs dWiNA with following arguments: --method=backward
    """
    gaston_bin = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'build/gaston')
    args = (gaston_bin, '--verify', '"{}"'.format(test))
    output, retcode = runProcess(args, timeout)

    # Fixme: This should be the issue of segfault
    if retcode != 0:
        if retcode == 124:
            return timeout_error, ""
        else:
            return dwina_error, ""
    return parsedWiNAOutput(output, "", checkonly)


def run_valgrind(test, timeout):
    '''
    Runs gaston on valgrind to check for leaks
    :param test:
    :param timeout:
    :return:
    '''
    args = ('valgrind', './gaston', '"{}"'.format(test))
    output, retcode = runProcess(args, timeout, True)

    return parse_valgrind_output(output)


def runProcess(args, timeout, from_error=False):
    '''
    Opens new subprocess and runs the arguments

    @param: arguments to be run in subprocess
    @return read output
    '''
    if from_error:
        proc = subprocess.Popen(" ".join(args), shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output = proc.stderr.readlines()
        proc.wait()
        return output, proc.returncode
    else:
        timeout = "timeout {0}m".format(timeout) if (timeout is not None) else None
        if timeout is None:
            proc = subprocess.Popen(" ".join(args), shell=True, stdout=subprocess.PIPE)
        else:
            proc = subprocess.Popen(" ".join((timeout, ) + args), shell=True, stdout=subprocess.PIPE)
        output = proc.stdout.readlines()
        proc.wait()
        return output, proc.returncode


def parseMonaOutput(output, isExPNF, checkonly=False):
    '''
    Gets mona or mona-expnf output, strips all whitespaces from start then
    gets a line with "Total time:", parses the time in seconds, in float,
    filters out all the automata until first projection and then gets all
    automata that are minimized and summed up to get the number of states that
    mona generates

    @param output: lines with mona output
    '''
    strippedLines = [line.lstrip() for line in output]

    ret = ""
    for line in strippedLines:
        match = re.search("Formula is ([a-zA-Z]+)", line)
        if match is not None:
            ret = match.group(1)
            break
        match = re.search("A satisfying example", line)
        if match is not None:
            ret = "satisfiable"
            break

    return (-1, -1), ret


def parse_total_time(line):
    """
    @param line: time line in format 'Total time: 00:00:00.00'
    @return: time in seconds, in float
    """
    match = re.search("([0-9][0-9]):([0-9][0-9]):([0-9][0-9].[0-9][0-9])", line)
    time = 3600*float(match.group(1)) + 60*float(match.group(2)) + float(match.group(3))
    return time if time != 0 else 0.01


def parsedWiNAOutput(output, unprunedOutput, checkonly=False):
    """

    @param output: lines with dwina output
    """
    stripped_lines = [line.lstrip() for line in output]
    ret = ""
    time = -1
    for line in stripped_lines:
        match = re.search("\[!\] Formula is [^']*'([A-Z]+)'[^']*", line)
        if match is not None:
            ret = match.group(1)
            break
    for line in stripped_lines:
        match = re.search("\[*\] Total elapsed time: ", line)
        if match is not None:
            time = parse_total_time(line)
            break
    for line in stripped_lines:
        match = re.search("INCORRECT", line)
        if match is not None:
            ret = 'incorrect'
    return time, ret


def parse_valgrind_output(output):
    """

    :param output:
    :return:
    """
    stripped_lines = [line.lstrip() for line in output]
    for line in stripped_lines:
        match = re.search(".*total heap usage: ([0-9]*(,[0-9]*)*) allocs, ([0-9]*(,[0-9]*)*) frees, ([0-9]*(,[0-9]*)*) bytes allocated", line)
        if match is not None:
            return match.group(1), match.group(3), match.group(5)


def parse_arguments():
    """
    Parse input arguments
    """
    parser = createArgumentParser()
    if len(sys.argv) == 0:
        parser.print_help()
        quit()
    else:
        opt = parser.parse_args()
        if opt.check_mem is True and opt.check_perf is True:
            print("Conflicting options --check-mem and --check-perf")
            parser.print_help()
            quit()
        elif opt.check_perf and opt.timeout is None:
            opt.timeout = "1"
        return opt


def get_benchmark_dir(opt):
    """
    Returns the dir with the benchmarks according to the set options

    :param opt  options of testcheck.py
    """
    benchmark_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tests",
                                 opt.dir if opt.check_perf is False else "perf")
    return benchmark_dir


def load_performance_times():
    """
    :returns dictionary for benchmarks and times
    """
    with open(reference_file, 'r') as p:
        dict = {line.split(":")[0]: float(line.split(":")[1].strip()) for line in p.readlines() if ":" in line}
        return dict


def check_times(time, reference_time):
    """
    :returns (is performance degradation, is performance gain)
    """
    lower_bound = reference_time*(1-time_error)
    upper_bound = reference_time*(1+time_error)

    if time == -2:
        if reference_time == -2:
            return True, False, reference_time, reference_time
        else:
            return True, False, lower_bound, upper_bound

    if time > upper_bound:
        return True, False, lower_bound, upper_bound
    if time < lower_bound:
        return False, True, lower_bound, upper_bound
    else:
        return False, False, lower_bound, upper_bound


if __name__ == '__main__':
    print("[*] WSkS Test Bench")
    print("[c] Tomas Fiedor, ifiedortom@fit.vutbr.cz")

    options = parse_arguments()

    # we will generate stuff
    data = {}
    performance_reference = load_performance_times()

    # modification and setup of parameters
    bins = ['mona', 'gaston']
    wdir = get_benchmark_dir(options)

    # iterate through all files in dir
    cases = 0
    all_cases = []
    fails = 0
    failed_cases = []
    for root, dirs, filenames in os.walk(wdir):
        for f in filenames:
            benchmark = os.path.join(root, f)
            data[benchmark] = {}
            if not benchmark.endswith('.mona'):
                continue
            if benchmark.endswith('_unsatisfying.mona') or benchmark.endswith('_satisfying.mona'):
                continue

            print("[*] Running test bench:"),
            print(colored("'{}'".format(benchmark), "white", attrs=["bold"]))
            rets = {'gaston': ""}
            for bin in bins:
                method_name = "_".join(["run"] + bin.split('-'))
                method_call = getattr(sys.modules[__name__], method_name)
                data[benchmark][bin], rets[bin] = method_call(benchmark, options.timeout, options.check)
            allocs, frees, bytes = "", "", ""
            if options.check_mem:
                allocs, frees, bytes = run_valgrind(benchmark, options.timeout)
            cases += 1
            if options.check_perf:
                print("\t->"),
                try:
                    is_degradation, is_gain, lower_bound, upper_bound \
                        = check_times(data[benchmark]['gaston'], performance_reference[benchmark])
                    if is_degradation:
                        print(colored("FAIL", "red")),
                        print(":"),
                        print(colored(data[benchmark]['gaston'], "red")),
                        print(" degrades from {0:.3f}".format(upper_bound))
                        fails += 1
                    else:
                        print(colored("OK", "green")),
                        print(":"),
                        print(colored(data[benchmark]['gaston'], "green")),
                        print(" within bounds ({0:.3f}, {1:.3f})".format(lower_bound, upper_bound))
                        performance_reference[benchmark] = \
                            (performance_reference[benchmark] + data[benchmark]['gaston']) / 2
                except KeyError as k:
                    print(colored("UNKNOWN", "yellow")),
                    print(":"),
                    print(data[benchmark]['gaston'])
                    performance_reference[benchmark] = data[benchmark]['gaston']
            elif rets['mona'] == -1 or rets['mona'] == "":
                print("\t-> MONA failed or could not be determined")
            elif rets['mona'].upper() != rets['gaston']:
                print("\t->"),
                print(colored("FAIL", "red")),
                print("; Formula is "),
                print(colored("'{}'".format(rets['mona']), "white")),
                print(" (gaston returned "),
                print(colored("'{}'".format(rets['gaston'].lower()), "white")),
                print(")")
                fails += 1
                failed_cases.append("'{}': gaston ('{}') vs mona ('{}')".format(benchmark, rets['gaston'], rets['mona'].lower()))
                all_cases.append("FAIL : '{}': gaston ('{}') vs mona ('{}')".format(benchmark, rets['gaston'], rets['mona'].lower()))
            elif allocs != frees:
                print("\t->"),
                print(colored("FAIL", "red")),
                print("; Memcheck found leaks: "),
                print(" {} allocs / {} frees (out of {} allocated)".format(allocs, frees, bytes))
                fails += 1
                failed_cases.append("'{}': gaston leaks: {} allocs / {} frees".format(benchmark, allocs, frees))
                all_cases.append("FAIL : '{}': gaston leaks: {} allocs / {} frees".format(benchmark, allocs, frees))
            else:
                all_cases.append("OK : '{}': gaston ('{}') vs mona ('{}')".format(benchmark, rets['gaston'], rets['mona'].lower()))
                print("\t->"),
                print(colored("OK", "green")),
                print("; Formula is"),
                print(colored("'{}'".format(rets['mona']), "white")),
                print(" ({} allocated)".format(allocs, frees, bytes))

    print("[*] Running statistics of tests:")
    print("[!] "),
    clr = "red" if cases-fails != cases else "green"
    print(colored("{0}/{1} passes".format(cases-fails, cases), clr, attrs=["bold"]))
    print("[!] Regression tests "),
    with open('testbench.log', 'w') as ac_file:
        ac_file.write("\n".join(all_cases))
    with open(reference_file, 'w') as ref_file:
        ref_file.write("\n".join("{}:{}".format(key, value) for (key, value) in performance_reference.items()))
    if cases-fails != cases:
        print(colored("failed", "red", attrs=["bold"]))
        print("[!] Saving failed cases to :"),
        print(colored("'testbench-fail.log'", "grey", attrs=["bold"]))
        with open('testbench-fail.log', 'w') as fc_file:
            fc_file.write("\n".join(failed_cases))
    else:
        print(colored("passed", "green", attrs=["bold"]))
    if cases-fails != cases:
        exit(1)
    else:
        exit(0)