-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcsv_to_json.py
More file actions
130 lines (115 loc) · 5.44 KB
/
csv_to_json.py
File metadata and controls
130 lines (115 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import csv
import json
import sys
def parse_field(val, target_type, null_repr='\\N'):
    """Convert *val* using *target_type*, mapping the null marker to None.

    *null_repr* defaults to the PostgreSQL COPY null representation (``\\N``).
    Raises whatever *target_type* raises on a malformed value.
    """
    return None if val == null_repr else target_type(val)
def parse_cost_field(val):
    """Parse a cost column into a float.

    The ``\\N`` null marker becomes None; 'Infinity'/'Inf' and 'NaN'
    (any case) become the corresponding IEEE-754 specials so they can
    later be rewritten into JSON-compatible strings.
    """
    if val == '\\N':
        return None
    token = val.upper()
    if token in ('INFINITY', 'INF'):
        return float('inf')
    if token == 'NAN':
        return float('nan')
    return float(val)
# Column order must match the tab-separated planner-path dump
# (one row per candidate path, PostgreSQL COPY format).
_COLUMNS = [
    "query_id", "subquery_id", "subquery_level", "rel_id", "path_id",
    "path_type", "child_paths", "startup_cost", "total_cost", "rows",
    "width", "rel_name", "rel_alias", "indexoid", "level",
    "add_path_result", "displaced_by", "cost_cmp", "fuzz_factor", "pathkeys_cmp",
    "bms_cmp", "rows_cmp", "parallel_safe_cmp", "disabled_nodes", "relids"
]

# PostgreSQL COPY representation of NULL.
_NULL = '\\N'


def _clean(val):
    """Return None for the \\N null marker, otherwise the raw string."""
    return None if val == _NULL else val


def _parse_path_entry(record):
    """Build one path dict from a raw column->string record.

    Raises (ValueError/KeyError) on malformed rows; the caller reports
    and skips such rows.
    """
    # level: when NULL, default to 0 instead of dropping the row.
    level_val = record["level"]
    level = int(level_val) if level_val != _NULL else 0
    # subquery_id: default to 1 only when NULL.  An explicit `is None`
    # test is used because the previous `... or 1` idiom would also have
    # clobbered a legitimate parsed value of 0.
    subquery_id = parse_field(record["subquery_id"], int)
    if subquery_id is None:
        subquery_id = 1
    # child_paths arrives as a PostgreSQL array literal, e.g. "{1,2,3}".
    if record["child_paths"] == _NULL:
        child_paths = []
    else:
        child_paths = [
            int(x.strip())
            for x in record["child_paths"].strip("{}").split(",")
            if x.strip()
        ]
    return {
        "query_id": parse_field(record["query_id"], int),
        "subquery_id": subquery_id,
        "subquery_level": parse_field(record["subquery_level"], int),
        "level": level,
        "rel_id": parse_field(record["rel_id"], int),
        "path_id": parse_field(record["path_id"], int),
        "path_type": _clean(record["path_type"]),
        "child_paths": child_paths,
        "rel_name": _clean(record["rel_name"]),
        "rel_alias": _clean(record["rel_alias"]),
        "startup_cost": parse_cost_field(record["startup_cost"]),
        "total_cost": parse_cost_field(record["total_cost"]),
        "rows": parse_field(record["rows"], int),
        "width": parse_field(record["width"], int),
        "indexoid": parse_field(record["indexoid"], int),
        "add_path_result": _clean(record["add_path_result"]),
        "displaced_by": parse_field(record["displaced_by"], int),
        "cost_cmp": _clean(record["cost_cmp"]),
        "fuzz_factor": parse_field(record["fuzz_factor"], float),
        "pathkeys_cmp": _clean(record["pathkeys_cmp"]),
        "bms_cmp": _clean(record["bms_cmp"]),
        "rows_cmp": _clean(record["rows_cmp"]),
        "parallel_safe_cmp": _clean(record["parallel_safe_cmp"]),
        "disabled_nodes": _clean(record["disabled_nodes"]),
        "relids": _clean(record["relids"]),
    }


def _group_paths(paths):
    """Group flat path entries into queries -> subqueries -> relations.

    The relation's name/alias come from the first path seen for that
    rel_id; every path carries its own "level" field, so no per-level
    nesting is needed.
    """
    data = {"queries": {}}
    for p in paths:
        query = data["queries"].setdefault(p["query_id"], {"subqueries": {}})
        subquery = query["subqueries"].setdefault(
            p["subquery_id"], {"relations": {}}
        )
        relation = subquery["relations"].setdefault(p["rel_id"], {
            "name": p["rel_name"],
            "alias": p["rel_alias"],
            "paths": [],
        })
        relation["paths"].append(p)
    return data


def _replace_special_floats(obj):
    """Recursively replace inf/-inf/NaN floats with string placeholders.

    json.dumps would otherwise emit the non-standard tokens Infinity/NaN,
    which strict JSON parsers reject.
    """
    if isinstance(obj, dict):
        return {k: _replace_special_floats(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_replace_special_floats(v) for v in obj]
    if isinstance(obj, float):
        if obj != obj:  # NaN is the only value not equal to itself
            return "NaN"
        if obj == float('inf'):
            return "Infinity"
        if obj == float('-inf'):
            return "-Infinity"
    return obj


def main(csv_path):
    """Convert a tab-separated planner-path dump into grouped JSON on stdout.

    Reads *csv_path* (tab-delimited, \\N for NULL), groups the paths by
    query -> subquery -> relation, and prints pretty JSON.  Rows that
    fail to parse are reported on stderr and skipped (best effort: one
    malformed row must not abort the whole conversion).
    """
    paths = []
    with open(csv_path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        for row in reader:
            record = dict(zip(_COLUMNS, row))
            try:
                paths.append(_parse_path_entry(record))
            except Exception as e:
                print(f"Пропущена строка: {row} ({e})", file=sys.stderr)
                continue
    data = _group_paths(paths)
    print(json.dumps(_replace_special_floats(data), indent=2))
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Использование: python3 csv_to_json.py <file.csv>", file=sys.stderr)
sys.exit(1)
main(sys.argv[1])