Skip to content

Commit adfca4d

Browse files
authored
Merge pull request #7370 from chaen/v8.0_feat_consistency_checks
[v8.0] Add consistency check tools
2 parents 3081d48 + a25cc56 commit adfca4d

File tree

3 files changed

+282
-4
lines changed

3 files changed

+282
-4
lines changed

consistency_check/README.md

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Consistency check
2+
3+
This script helps compare storage-element (SE) dumps against DIRAC File Catalog (DFC) dumps.
4+
5+
## What you need
6+
7+
### SE definitions
8+
9+
A CSV file containing each SE's name and base path, like
10+
11+
```
12+
CSCS-DST;/pnfs/lcg.cscs.ch/lhcb
13+
CSCS_MC-DST;/pnfs/lcg.cscs.ch/lhcb
14+
```
15+
16+
You can obtain it with something like
17+
18+
```python
19+
from DIRAC import initialize
20+
initialize()
21+
from DIRAC import gConfig
22+
from DIRAC.Resources.Storage.StorageElement import StorageElement
23+
24+
for se in gConfig.getSections("/Resources/StorageElements")["Value"]:
25+
print(f"{se};{list(StorageElement(se).storages.values())[0].basePath}")
26+
```
27+
28+
### StorageElement dump
29+
30+
This is typically provided by the site, and we expect just a flat list of the files
31+
32+
```
33+
/pnfs/lcg.cscs.ch/lhcb/generated/2013-07-07/fileeed071eb-1aa0-4d00-8775-79624737224e
34+
/pnfs/lcg.cscs.ch/lhcb/generated/2013-07-10/fileed08b040-196c-46d9-b4d6-37d80cba27eb
35+
/pnfs/lcg.cscs.ch/lhcb/lhcb/test/SAM/testfile-put-LHCb-Disk-1494915199-61e6d085bb84.txt
36+
```
37+
38+
### Catalog dump(s)
39+
40+
Ideally, you should have two catalog dumps for the SE that you are concerned about: one taken before the SE dump, and one taken after. Having only one of the two allows only a partial comparison.
41+
42+
You could get it with a script like
43+
44+
```python
45+
import sys
46+
from datetime import datetime,timezone
47+
from DIRAC import initialize
48+
initialize()
49+
from DIRAC import gConfig
50+
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
51+
dfc = FileCatalogClient()
52+
53+
# Something like LCG.CERN.ch
54+
site_name = sys.argv[1]
55+
56+
ses = gConfig.getOption(f"/Resources/Sites/{site_name.split('.')[0]}/{site_name}/SE",[])["Value"]
57+
58+
timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
59+
output_file = f"{site_name}_dfc_{timestamp}.dump"
60+
print(f"Getting FC dump for {ses} in {output_file}")
61+
res = dfc.getSEDump(ses, output_file)
62+
print(res)
63+
```
64+
65+
66+
Or from a `BaseSE`
67+
68+
```python
69+
#!/usr/bin/env python3
70+
71+
import sys
72+
from datetime import datetime,timezone
73+
from DIRAC import initialize
74+
initialize()
75+
from DIRAC import gConfig
76+
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
77+
dfc = FileCatalogClient()
78+
79+
# Something like RAL-ECHO
80+
base_se_name = sys.argv[1]
81+
82+
ses = []
83+
ses_data = gConfig.getOptionsDictRecursively(f"/Resources/StorageElements")["Value"]
84+
for key, val in ses_data.items():
85+
try:
86+
if val['BaseSE'] == base_se_name:
87+
ses.append(key)
88+
except (KeyError, TypeError):
89+
pass
90+
91+
timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
92+
output_file = f"{base_se_name}_dfc_{timestamp}.dump"
93+
print(f"Getting FC dump for {ses} in {output_file}")
94+
res = dfc.getSEDump(ses, output_file)
95+
print(res)
96+
```
97+
98+
## How it works
99+
100+
We look at the differences and the intersections between the dump of the old catalog, the new catalog, and the storage element.
101+
102+
For example, you find dark data by looking at files that are in the SE dump, but not in either of the catalog dumps. Lost data is data that is in both catalog dumps, but not in the SE dump.
103+
104+
105+
| Old FC | New FC | SE | Status |
106+
|--------|--------|----|------------------|
107+
| 0 | 0 | 1 | Dark data |
108+
| 0 | 1 | 0 | Very new |
109+
| 0 | 1 | 1 | New |
110+
| 1 | 0 | 0 | Deleted |
111+
| 1 | 0 | 1 | Recently deleted |
112+
| 1 | 1 | 0 | Lost file |
113+
| 1 | 1 | 1 | OK |
114+
115+
## How to use
116+
117+
Although you probably need DIRAC to be able to get the DFC dump or the SE config, you do not need DIRAC installed once you have all the `csv` files.
118+
You will however need `pandas` and `typer`
119+
120+
121+
The `consistency` script has 3 commands:
122+
* `threeway`: do a proper comparison of one old DFC dump, one SE dump, and one new DFC dump. Results are as good as it gets
123+
* `possibly-dark-data`: Tries to find dark data but be careful of the result (see `help`).
124+
* `possibly-lost-data`: Tries to find lost data but be careful of the result (see `help`).
125+
126+
In any case, you should check the output with commands like `dirac-dms-replica-stats` or `dirac-dms-pfn-exists`.

consistency_check/consistency.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env python
2+
import pandas as pd
3+
import typer
4+
from pathlib import Path
5+
from typer import colors
6+
from typing import Annotated
7+
8+
9+
RED = colors.RED
10+
GREEN = colors.GREEN
11+
12+
app = typer.Typer()
13+
14+
15+
def load_se_definition(se_def_path):
    """Load the SE definition CSV (``seName;basePath`` per line).

    :param se_def_path: path (or file-like object) of the ';'-separated CSV
    :returns: DataFrame indexed by SE name with a single ``basePath`` column
    """
    columns = ["seName", "basePath"]
    return pd.read_csv(se_def_path, names=columns, sep=";", index_col="seName")
17+
18+
19+
def load_dfc_dump(dfc_dump_path, version):
    """Load a DFC dump ('|'-separated: seName|lfn|cks|size) and tag each row.

    :param dfc_dump_path: path (or file-like object) of the catalog dump
    :param version: label stored in a ``version`` column (e.g. "old_fc")
    :returns: DataFrame with columns seName, lfn, cks, size, version
    """
    frame = pd.read_csv(dfc_dump_path, sep="|", names=["seName", "lfn", "cks", "size"])
    frame["version"] = version
    return frame
23+
24+
25+
def load_se_dump(se_dump_path):
    """Load a storage-element dump: a flat list of PFNs, one per line.

    :param se_dump_path: path (or file-like object) of the SE dump
    :returns: DataFrame indexed by PFN, with a constant ``version`` column
    """
    dump = pd.read_csv(se_dump_path, sep=";", names=["pfn"], index_col="pfn")
    return dump.assign(version="se_dump")
29+
30+
31+
@app.command()
def possibly_lost_data(
    fc_dump_file: Annotated[Path, typer.Option(help="DFC dump AFTER the SE dump")],
    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
    lost_file_output: Annotated[Path, typer.Option(help="Output file in which to dump lost")] = "lost.csv",
):
    """
    DANGER: make a partial comparison of an SE dump and an FC dump to find lost data

    Be careful because you can't trust the result:

    * if the FC dump is more recent than the SE dump, you may get files that were added on the SE after the dump
    * if the FC dump is older than the SE dump, the file may have been deliberately removed
    """
    se_dump = load_se_dump(se_dump_file)
    se_def = load_se_definition(se_def_file)

    # Compute the PFN for each LFN in the DFC dump: join the SE definitions
    # on the SE name, then concatenate basePath + LFN.
    fc_dump = load_dfc_dump(fc_dump_file, "fc")
    fc_dump = pd.merge(fc_dump, se_def, on="seName")
    fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
    fc_dump.set_index("pfn", inplace=True)

    # Lost files: present in the FC dump but missing from the SE dump
    lostData = fc_dump.index.difference(se_dump.index)
    if len(lostData):
        typer.secho(f"Found {len(lostData)} lost files, dumping them in {lost_file_output}", err=True, fg=RED)
        lostDataDetail = fc_dump[fc_dump.index.isin(lostData)]
        lostDataDetail.to_csv(lost_file_output)
    else:
        # BUGFIX: this command checks for lost data, not dark data
        typer.secho("No lost data found", fg=GREEN)
63+
64+
65+
@app.command()
def possibly_dark_data(
    fc_dump_file: Annotated[Path, typer.Option(help="DFC dump")],
    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
    dark_file_output: Annotated[Path, typer.Option(help="Output file in which to dump dark data")] = "dark.csv",
):
    """
    DANGER: make a partial comparison of an SE dump and an FC dump to find dark data.

    Be careful because you can't trust the result:

    * if the FC dump is more recent than the SE dump, you may get files that were already removed
    * if the FC dump is older than the SE dump, you may find files that were added properly after the dump (DANGER)
    """
    se_dump = load_se_dump(se_dump_file)
    se_def = load_se_definition(se_def_file)

    # Compute the PFN for each LFN in the DFC dump: join the SE definitions
    # on the SE name, then concatenate basePath + LFN.
    fc_dump = load_dfc_dump(fc_dump_file, "fc")
    fc_dump = pd.merge(fc_dump, se_def, on="seName")
    fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
    fc_dump.set_index("pfn", inplace=True)

    # Dark data: present in the SE dump but not known to the FC
    typer.echo("Computing dark data")
    darkData = se_dump.index.difference(fc_dump.index)

    if len(darkData):
        typer.secho(f"Found {len(darkData)} dark data, dumping them in {dark_file_output}", err=True, fg=RED)
        pd.DataFrame(index=darkData).to_csv(dark_file_output)
    else:
        typer.secho("No dark data found", fg=GREEN)
99+
100+
101+
@app.command()
def threeway(
    old_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump BEFORE the SE dump")],
    new_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump AFTER the SE dump")],
    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
    lost_file_output: Annotated[Path, typer.Option(help="Output file in which to dump lost files")] = "lost.csv",
    dark_file_output: Annotated[Path, typer.Option(help="Output file in which to dump dark data")] = "dark.csv",
):
    """
    Make a full comparison of two FC dumps and one SE dump
    """
    se_dump = load_se_dump(se_dump_file)
    se_def = load_se_definition(se_def_file)

    # Compute the PFN for each LFN in each DFC dump: join the SE definitions
    # on the SE name, then concatenate basePath + LFN.
    old_fc_dump = load_dfc_dump(old_fc_dump_file, "old_fc")
    old_fc_dump = pd.merge(old_fc_dump, se_def, on="seName")
    old_fc_dump["pfn"] = old_fc_dump["basePath"] + old_fc_dump["lfn"]
    old_fc_dump.set_index("pfn", inplace=True)

    new_fc_dump = load_dfc_dump(new_fc_dump_file, "new_fc")
    new_fc_dump = pd.merge(new_fc_dump, se_def, on="seName")
    new_fc_dump["pfn"] = new_fc_dump["basePath"] + new_fc_dump["lfn"]
    new_fc_dump.set_index("pfn", inplace=True)

    # Dark data: in the SE dump but in neither of the FC dumps
    typer.echo("Computing dark data")
    darkData = se_dump.index.difference(old_fc_dump.index.union(new_fc_dump.index))

    if len(darkData):
        typer.secho(f"Found {len(darkData)} dark data, dumping them in {dark_file_output}", err=True, fg=RED)
        pd.DataFrame(index=darkData).to_csv(dark_file_output)
    else:
        typer.secho("No dark data found", fg=GREEN)

    # Lost files: in both FC dumps but not in the SE dump
    lostData = (old_fc_dump.index.intersection(new_fc_dump.index)).difference(se_dump.index)
    if len(lostData):
        typer.secho(f"Found {len(lostData)} lost files, dumping them in {lost_file_output}", err=True, fg=RED)
        lostDataDetail = new_fc_dump[new_fc_dump.index.isin(lostData)]
        lostDataDetail.to_csv(lost_file_output)
    else:
        # BUGFIX: this branch reports on lost files, not dark data
        typer.secho("No lost files found", fg=GREEN)
148+
149+
150+
# Entry point: run the typer CLI when executed directly as a script.
if __name__ == "__main__":
    app()

docs/diracdoctools/scripts/dirac-docs-get-release-notes.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ def gitlabSetup(GITLABTOKEN=""):
9292
SESSION.headers.update({"PRIVATE-TOKEN": GITLABTOKEN})
9393

9494

95-
def req2Json(url, parameterDict=None, requestType="GET"):
95+
def req2Json(url, parameterDict=None, requestType="GET", queryParameters=None):
9696
"""Call to github API using requests package."""
9797
log = LOGGER.getChild("Requests")
9898
log.debug("Running %s with %s ", requestType, parameterDict)
99-
req = getattr(SESSION, requestType.lower())(url, json=parameterDict)
99+
req = getattr(SESSION, requestType.lower())(url, json=parameterDict, params=queryParameters)
100100
if req.status_code not in (200, 201):
101101
log.error("Unable to access API: %s", req.text)
102102
raise RuntimeError("Failed to access API")
@@ -433,7 +433,7 @@ def parseOptions(self):
433433
for var, val in sorted(vars(parsed).items()):
434434
log.info("Using options: %s = %s", var, pformat(val))
435435

436-
def _github(self, action):
436+
def _github(self, action, per_page=None):
437437
"""Return the url to perform actions on github.
438438
439439
:param str action: command to use in the gitlab API, see documentation there
@@ -442,6 +442,7 @@ def _github(self, action):
442442
log = LOGGER.getChild("GitHub")
443443
options = dict(self._options)
444444
options["action"] = action
445+
445446
ghURL = f"https://api.github.com/repos/{options['owner']}/{options['repo']}/{options['action']}"
446447
log.debug("Calling: %s", ghURL)
447448
return ghURL
@@ -504,7 +505,7 @@ def getGithubLatestTagDate(self, sinceTag):
504505
log = LOGGER.getChild("getGithubLatestTagDate")
505506

506507
# Get all tags
507-
tags = req2Json(url=self._github("tags"))
508+
tags = req2Json(url=self._github("tags"), queryParameters={"per_page": 100})
508509
if isinstance(tags, dict) and "Not Found" in tags.get("message"):
509510
raise RuntimeError(f"Package not found: {str(self)}")
510511

0 commit comments

Comments
 (0)