-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path01_initiale_aufbereitung.py
More file actions
69 lines (53 loc) · 1.85 KB
/
01_initiale_aufbereitung.py
File metadata and controls
69 lines (53 loc) · 1.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import json
import glob
import os

import pandas as pd

# Initial preparation: read IIIF-style manifest JSON files from `output/`,
# extract (label -> canvas numbers) and (canvas number -> page label) tables,
# merge both onto the text-export CSV and persist the result as parquet.

# Folder containing the manifest JSON files
json_folder = "output/"

rows_label = []  # one row per structure: source file, label, list of canvas numbers
rows_page = []   # one row per canvas: source file, canvas number, page label

# Collect all JSON files in the folder
for filepath in glob.glob(os.path.join(json_folder, "*.json")):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    source_file = os.path.basename(filepath)  # hoisted: reused in both loops below

    # Each structure maps a label to a list of canvas URIs; the trailing URI
    # segment is taken to be the numeric canvas id.
    for struct in data.get("structures", []):
        try:
            nummern = [int(nummer.split("/")[-1]) for nummer in struct["canvases"]]
            rows_label.append(
                {
                    "source_file": source_file,
                    "label": struct["label"],
                    "nummern": nummern,
                }
            )
        except (KeyError, ValueError, AttributeError, TypeError):
            # Skip malformed structure entries (missing keys, non-numeric ids).
            continue

    # Page labels come from the canvases of the first sequence.
    sequences = data.get("sequences", [])
    if not sequences:
        # Fix: original indexed sequences[0] unguarded and crashed on
        # manifests with an empty/missing "sequences" list.
        continue
    for canvas in sequences[0].get("canvases", []):
        try:
            canvas_id = int(canvas["@id"].split("/")[-1])  # avoid shadowing builtin `id`
            rows_page.append(
                {
                    "source_file": source_file,
                    "nummern": canvas_id,
                    "page": canvas["label"],
                }
            )
        except (KeyError, ValueError, AttributeError, TypeError):
            # Skip malformed canvas entries.
            continue

# Build DataFrames: one row per (source_file, canvas number) with its label;
# keep="last" so later files/structures win on duplicates.
df_labels = (
    pd.DataFrame(rows_label)
    .explode("nummern")
    .drop_duplicates(subset=["source_file", "nummern"], keep="last")
    .reset_index(drop=True)
)
df_page = pd.DataFrame(rows_page).drop_duplicates("nummern", keep="last")

# Attach label and page information to the text export; `path` in the CSV
# is matched against the extracted canvas number.
df_csv = pd.read_csv("input/text_export_final.csv", sep=";", encoding="utf-8")
df_csv_merged = df_csv.merge(df_labels, left_on="path", right_on="nummern", how="left")
df_csv_merged = df_csv_merged.merge(df_page[["nummern", "page"]], on="nummern", how="left")
df_csv_merged = df_csv_merged.drop("nummern", axis=1)  # join key no longer needed
df_csv_merged.to_parquet("input/input_complete_dkz.parquet")