-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdump_from_old_taiga.py
More file actions
117 lines (100 loc) · 3.4 KB
/
dump_from_old_taiga.py
File metadata and controls
117 lines (100 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# One-off migration script: dump dataset metadata from the old Taiga 1
# SQLite databases into CSV files (datasets.csv, dataset_versions.csv).
import os
from sqlalchemy import create_engine

# Cache of precomputed one-line summaries, keyed by the file's original
# path under /xchip (see get_summary below).
cache_engine = create_engine(
    "sqlite:///" + os.path.abspath("summary_cache.db"), echo=True
)
cache_conn = cache_engine.connect()
# Primary Taiga 1 metadata database (named_data / data_version / user tables).
engine = create_engine("sqlite:///" + os.path.abspath("metadata.sqlite3"), echo=True)
conn = engine.connect()
def get_summary(filename):
    """Look up the cached one-line summary for a Taiga 1 data file.

    The summary cache is keyed by the file's absolute path on the old
    /xchip filesystem; when no cache entry exists, log the miss and
    fall back to the string "unknown".
    """
    path_key = "/xchip/datasci/data/taiga/" + filename
    hit = cache_conn.execute(
        "select summary from summary where path = ?", [path_key]
    ).fetchone()
    if hit is not None:
        return hit[0]
    print("could not find", path_key)
    return "unknown"
import csv

# Output files: one row per dataset, and one row per dataset version.
# NOTE(review): opened without a context manager; both are closed
# explicitly at the very end of the script.
w_ds = open("datasets.csv", "wt")
w_ds_csv = csv.writer(w_ds)
w_dv = open("dataset_versions.csv", "wt")
w_dv_csv = csv.writer(w_dv)
def get_version_desc(permaname, version):
    """Fetch (description, creator email) for one version of a dataset.

    The email comes through a left outer join on user, so it is None
    when the creating user is unknown. Returns the raw row (or None if
    no matching version exists).
    """
    print("fetching", permaname, version)
    query = "select v.description, u.email from data_version v join named_data nd on nd.named_data_id = v.named_data_id left outer join user u on u.user_id = v.created_by_user_id where nd.permaname = ? and v.version = ?"
    params = [permaname, version]
    return conn.execute(query, params).fetchone()
# Header rows for the two output CSVs.
w_ds_csv.writerow(["name", "permaname", "description", "folder"])
w_dv_csv.writerow(
    [
        "permaname",
        "id",
        "version",
        "type",
        "short_desc",
        "created_by",
        "created_timestamp",
        "s3_location",
    ]
)
# Walk every named dataset in the old Taiga database: emit one
# datasets.csv row per dataset and one dataset_versions.csv row per
# version of that dataset.
for name, permaname, is_public, latest_version in conn.execute(
    "select name, permaname, is_public, latest_version from named_data"
).fetchall():
    # Datasets with an empty permaname are silently skipped.
    if len(permaname) == 0:
        continue
    # , "Dataset {} has no permaname".format(name)
    # The dataset-level description and owner are taken from the
    # latest version's row.
    # NOTE(review): get_version_desc returns None when no row matches
    # (permaname, latest_version); that would raise a TypeError on
    # unpacking here — confirm latest_version always exists.
    description, created_by = get_version_desc(permaname, latest_version)
    # Choose the destination folder in the new system:
    #   achilles*          -> "achilles"
    #   public ccle*       -> "ccle"
    #   other public       -> "public"
    #   unowned avana*     -> "achilles"
    #   everything else    -> the owner's home folder, defaulting the
    #                         owner to pmontgom when unknown
    if permaname.startswith("achilles"):
        folder = "achilles"
    elif is_public:
        if permaname.startswith("ccle"):
            folder = "ccle"
        else:
            folder = "public"
    else:
        if created_by is None and permaname.startswith("avana"):
            folder = "achilles"
        else:
            if created_by is None:
                created_by = "pmontgom@broadinstitute.org"
            folder = "home({})".format(created_by)
    w_ds_csv.writerow([name, permaname, description, folder])
    # Emit every version of this dataset, oldest first.
    dv_count = 0
    for (
        dataset_id,
        version,
        hdf5_path,
        columnar_path,
        created_by,
        created_timestamp,
    ) in conn.execute(
        "select dataset_id, version, hdf5_path, columnar_path, u.email, created_timestamp from data_version v join named_data nd on nd.named_data_id = v.named_data_id left outer join user u on u.user_id = v.created_by_user_id where nd.permaname = ? order by v.version",
        [permaname],
    ).fetchall():
        # A version is stored either as HDF5 or as columnar data; the
        # migrated file name is derived from the dataset_id.
        if hdf5_path is not None:
            df_type = "hdf5"
            filename = dataset_id + ".hdf5"
        else:
            df_type = "columnar"
            filename = dataset_id + ".columnar"
        short_desc = get_summary(filename)
        # Versions with no recorded creator get a placeholder owner.
        if created_by is None:
            created_by = "unowned-from-taiga1@broadinstitute.org"
        w_dv_csv.writerow(
            [
                permaname,
                dataset_id,
                version,
                df_type,
                short_desc,
                created_by,
                created_timestamp,
                "s3://taiga2/migrated/" + filename,
            ]
        )
        dv_count += 1
    # Sanity check: every surviving dataset must have at least one version.
    # NOTE(review): assert is stripped under `python -O`; a raised
    # exception would be more robust for validation.
    assert dv_count > 0, "{} has no versions".format(permaname)
w_ds.close()
w_dv.close()