Skip to content

Commit adfca4d

Browse files
authored
Merge pull request #7370 from chaen/v8.0_feat_consistency_checks
[v8.0] Add consistency check tools
2 parents 3081d48 + a25cc56 commit adfca4d

File tree

3 files changed

+282
-4
lines changed

3 files changed

+282
-4
lines changed

consistency_check/README.md

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Consistency check
2+
3+
This script helps compare storage-element (SE) dumps against DIRAC File Catalog (DFC) dumps.
4+
5+
## What you need
6+
7+
### SE definitions
8+
9+
A CSV file containing each SE's name and base path, like
10+
11+
```
12+
CSCS-DST;/pnfs/lcg.cscs.ch/lhcb
13+
CSCS_MC-DST;/pnfs/lcg.cscs.ch/lhcb
14+
```
15+
16+
You can obtain it with something like
17+
18+
```python
19+
from DIRAC import initialize
20+
initialize()
21+
from DIRAC import gConfig
22+
from DIRAC.Resources.Storage.StorageElement import StorageElement
23+
24+
for se in gConfig.getSections("/Resources/StorageElements")["Value"]:
25+
print(f"{se};{list(StorageElement(se).storages.values())[0].basePath}")
26+
```
27+
28+
### StorageElement dump
29+
30+
This is typically provided by the site, and we expect just a flat list of the files
31+
32+
```
33+
/pnfs/lcg.cscs.ch/lhcb/generated/2013-07-07/fileeed071eb-1aa0-4d00-8775-79624737224e
34+
/pnfs/lcg.cscs.ch/lhcb/generated/2013-07-10/fileed08b040-196c-46d9-b4d6-37d80cba27eb
35+
/pnfs/lcg.cscs.ch/lhcb/lhcb/test/SAM/testfile-put-LHCb-Disk-1494915199-61e6d085bb84.txt
36+
```
37+
38+
### Catalog dump(s)
39+
40+
Ideally, you should have two catalog dumps for the SE that you are concerned about: one taken before the SE dump, and one taken after. Having only one of the two allows only a partial comparison.
41+
42+
You could get it with a script like
43+
44+
```python
45+
import sys
46+
from datetime import datetime,timezone
47+
from DIRAC import initialize
48+
initialize()
49+
from DIRAC import gConfig
50+
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
51+
dfc = FileCatalogClient()
52+
53+
# Something like LCG.CERN.ch
54+
site_name = sys.argv[1]
55+
56+
ses = gConfig.getOption(f"/Resources/Sites/{site_name.split('.')[0]}/{site_name}/SE",[])["Value"]
57+
58+
timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
59+
output_file = f"{site_name}_dfc_{timestamp}.dump"
60+
print(f"Getting FC dump for {ses} in {output_file}")
61+
res = dfc.getSEDump(ses, output_file)
62+
print(res)
63+
```
64+
65+
66+
Or from a `BaseSE`
67+
68+
```python
69+
#!/usr/bin/env python3
70+
71+
import sys
72+
from datetime import datetime,timezone
73+
from DIRAC import initialize
74+
initialize()
75+
from DIRAC import gConfig
76+
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
77+
dfc = FileCatalogClient()
78+
79+
# Something like RAL-ECHO
80+
base_se_name = sys.argv[1]
81+
82+
ses = []
83+
ses_data = gConfig.getOptionsDictRecursively(f"/Resources/StorageElements")["Value"]
84+
for key, val in ses_data.items():
85+
try:
86+
if val['BaseSE'] == base_se_name:
87+
ses.append(key)
88+
except (KeyError, TypeError):
89+
pass
90+
91+
timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
92+
output_file = f"{base_se_name}_dfc_{timestamp}.dump"
93+
print(f"Getting FC dump for {ses} in {output_file}")
94+
res = dfc.getSEDump(ses, output_file)
95+
print(res)
96+
```
97+
98+
## How it works
99+
100+
We look at the differences and the intersections between the dump of the old catalog, the new catalog, and the storage element.
101+
102+
For example, you find dark data by looking at files that are in the SE dump, but not in either of the catalog dumps. Lost data is data that is in both catalog dumps, but not in the SE dump.
103+
104+
105+
| Old FC | New FC | SE | Status |
106+
|--------|--------|----|------------------|
107+
| 0 | 0 | 1 | Dark data |
108+
| 0 | 1 | 0 | Very new |
109+
| 0 | 1 | 1 | New |
110+
| 1 | 0 | 0 | Deleted |
111+
| 1 | 0 | 1 | Recently deleted |
112+
| 1 | 1 | 0 | Lost file |
113+
| 1 | 1 | 1 | OK |
114+
115+
## How to use
116+
117+
Although you probably need DIRAC to be able to get the DFC dump or the SE config, you do not need DIRAC installed once you have all the `csv` files.
118+
You will however need `pandas` and `typer`
119+
120+
121+
The `consistency` script has 3 commands:
122+
* `threeway`: do a proper comparison of one old DFC dump, one SE dump, and one new DFC dump. Results are as good as it gets
123+
* `possibly-dark-data`: Tries to find dark data but be careful of the result (see `help`).
124+
* `possibly-lost-data`: Tries to find lost data but be careful of the result (see `help`).
125+
126+
In any case, you should check the output with commands like `dirac-dms-replica-stats` or `dirac-dms-pfn-exists`.

consistency_check/consistency.py

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env python
2+
import pandas as pd
3+
import typer
4+
from pathlib import Path
5+
from typer import colors
6+
from typing import Annotated
7+
8+
9+
RED = colors.RED
10+
GREEN = colors.GREEN
11+
12+
app = typer.Typer()
13+
14+
15+
def load_se_definition(se_def_path):
    """Load the SE definition CSV (``seName;basePath`` per line).

    :param se_def_path: path (or file-like object) of the ';'-separated CSV
    :returns: DataFrame indexed by SE name with a single ``basePath`` column
    """
    columns = ["seName", "basePath"]
    return pd.read_csv(se_def_path, names=columns, sep=";", index_col="seName")
17+
18+
19+
def load_dfc_dump(dfc_dump_path, version):
    """Load a DFC dump ('|'-separated: seName|lfn|cks|size) and tag each row.

    :param dfc_dump_path: path (or file-like object) of the catalog dump
    :param version: label stored in a ``version`` column (e.g. "old_fc")
    :returns: DataFrame with columns seName, lfn, cks, size, version
    """
    frame = pd.read_csv(dfc_dump_path, sep="|", names=["seName", "lfn", "cks", "size"])
    frame["version"] = version
    return frame
23+
24+
25+
def load_se_dump(se_dump_path):
    """Load a storage-element dump: a flat list of PFNs, one per line.

    :param se_dump_path: path (or file-like object) of the SE dump
    :returns: DataFrame indexed by PFN, with a constant ``version`` column
    """
    dump = pd.read_csv(se_dump_path, sep=";", names=["pfn"], index_col="pfn")
    return dump.assign(version="se_dump")
29+
30+
31+
@app.command()
def possibly_lost_data(
    fc_dump_file: Annotated[Path, typer.Option(help="DFC dump AFTER the SE dump")],
    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
    lost_file_output: Annotated[Path, typer.Option(help="Output file in which to dump lost")] = "lost.csv",
):
    """
    DANGER: make a partial comparison of an SE dump and an FC dump to find lost data

    Be careful because you can't trust the result:

    * if the FC dump is more recent than the SE dump, you may get files that were added on the SE after the dump
    * if the FC dump is older than the SE dump, the file may have been deliberately removed
    """
    se_dump = load_se_dump(se_dump_file)
    se_def = load_se_definition(se_def_file)

    # Compute the PFN for each LFN in the DFC dump: join the SE definitions
    # on the SE name, then concatenate basePath + LFN.
    fc_dump = load_dfc_dump(fc_dump_file, "fc")
    fc_dump = pd.merge(fc_dump, se_def, on="seName")
    fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
    fc_dump.set_index("pfn", inplace=True)

    # Lost files: present in the FC dump but missing from the SE dump
    lostData = fc_dump.index.difference(se_dump.index)
    if len(lostData):
        typer.secho(f"Found {len(lostData)} lost files, dumping them in {lost_file_output}", err=True, fg=RED)
        lostDataDetail = fc_dump[fc_dump.index.isin(lostData)]
        lostDataDetail.to_csv(lost_file_output)
    else:
        # BUGFIX: this command checks for lost data, not dark data
        typer.secho("No lost data found", fg=GREEN)
63+
64+
65+
@app.command()
def possibly_dark_data(
    fc_dump_file: Annotated[Path, typer.Option(help="DFC dump")],
    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
    dark_file_output: Annotated[Path, typer.Option(help="Output file in which to dump dark data")] = "dark.csv",
):
    """
    DANGER: make a partial comparison of an SE dump and an FC dump to find dark data.

    Be careful because you can't trust the result:

    * if the FC dump is more recent than the SE dump, you may get files that were already removed
    * if the FC dump is older than the SE dump, you may find files that were added properly after the dump (DANGER)
    """
    se_dump = load_se_dump(se_dump_file)
    se_def = load_se_definition(se_def_file)

    # Compute the PFN for each LFN in the DFC dump: join the SE definitions
    # on the SE name, then concatenate basePath + LFN.
    fc_dump = load_dfc_dump(fc_dump_file, "fc")
    fc_dump = pd.merge(fc_dump, se_def, on="seName")
    fc_dump["pfn"] = fc_dump["basePath"] + fc_dump["lfn"]
    fc_dump.set_index("pfn", inplace=True)

    # Dark data: present in the SE dump but not known to the FC
    typer.echo("Computing dark data")
    darkData = se_dump.index.difference(fc_dump.index)

    if len(darkData):
        typer.secho(f"Found {len(darkData)} dark data, dumping them in {dark_file_output}", err=True, fg=RED)
        pd.DataFrame(index=darkData).to_csv(dark_file_output)
    else:
        typer.secho("No dark data found", fg=GREEN)
99+
100+
101+
@app.command()
def threeway(
    old_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump BEFORE the SE dump")],
    new_fc_dump_file: Annotated[Path, typer.Option(help="DFC dump AFTER the SE dump")],
    se_def_file: Annotated[Path, typer.Option(help="Definition of the SE path")],
    se_dump_file: Annotated[Path, typer.Option(help="Dump of the SE")],
    lost_file_output: Annotated[Path, typer.Option(help="Output file in which to dump lost files")] = "lost.csv",
    dark_file_output: Annotated[Path, typer.Option(help="Output file in which to dump dark data")] = "dark.csv",
):
    """
    Make a full comparison of two FC dumps and one SE dump
    """
    se_dump = load_se_dump(se_dump_file)
    se_def = load_se_definition(se_def_file)

    # Compute the PFN for each LFN in each DFC dump: join the SE definitions
    # on the SE name, then concatenate basePath + LFN.
    old_fc_dump = load_dfc_dump(old_fc_dump_file, "old_fc")
    old_fc_dump = pd.merge(old_fc_dump, se_def, on="seName")
    old_fc_dump["pfn"] = old_fc_dump["basePath"] + old_fc_dump["lfn"]
    old_fc_dump.set_index("pfn", inplace=True)

    new_fc_dump = load_dfc_dump(new_fc_dump_file, "new_fc")
    new_fc_dump = pd.merge(new_fc_dump, se_def, on="seName")
    new_fc_dump["pfn"] = new_fc_dump["basePath"] + new_fc_dump["lfn"]
    new_fc_dump.set_index("pfn", inplace=True)

    # Dark data: in the SE dump but in neither of the FC dumps
    typer.echo("Computing dark data")
    darkData = se_dump.index.difference(old_fc_dump.index.union(new_fc_dump.index))

    if len(darkData):
        typer.secho(f"Found {len(darkData)} dark data, dumping them in {dark_file_output}", err=True, fg=RED)
        pd.DataFrame(index=darkData).to_csv(dark_file_output)
    else:
        typer.secho("No dark data found", fg=GREEN)

    # Lost files: in both FC dumps but not in the SE dump
    lostData = (old_fc_dump.index.intersection(new_fc_dump.index)).difference(se_dump.index)
    if len(lostData):
        typer.secho(f"Found {len(lostData)} lost files, dumping them in {lost_file_output}", err=True, fg=RED)
        lostDataDetail = new_fc_dump[new_fc_dump.index.isin(lostData)]
        lostDataDetail.to_csv(lost_file_output)
    else:
        # BUGFIX: this branch reports on lost files, not dark data
        typer.secho("No lost files found", fg=GREEN)
148+
149+
150+
# Entry point: run the typer CLI when executed directly as a script.
if __name__ == "__main__":
    app()

docs/diracdoctools/scripts/dirac-docs-get-release-notes.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ def gitlabSetup(GITLABTOKEN=""):
9292
SESSION.headers.update({"PRIVATE-TOKEN": GITLABTOKEN})
9393

9494

95-
def req2Json(url, parameterDict=None, requestType="GET"):
95+
def req2Json(url, parameterDict=None, requestType="GET", queryParameters=None):
9696
"""Call to github API using requests package."""
9797
log = LOGGER.getChild("Requests")
9898
log.debug("Running %s with %s ", requestType, parameterDict)
99-
req = getattr(SESSION, requestType.lower())(url, json=parameterDict)
99+
req = getattr(SESSION, requestType.lower())(url, json=parameterDict, params=queryParameters)
100100
if req.status_code not in (200, 201):
101101
log.error("Unable to access API: %s", req.text)
102102
raise RuntimeError("Failed to access API")
@@ -433,7 +433,7 @@ def parseOptions(self):
433433
for var, val in sorted(vars(parsed).items()):
434434
log.info("Using options: %s = %s", var, pformat(val))
435435

436-
def _github(self, action):
436+
def _github(self, action, per_page=None):
437437
"""Return the url to perform actions on github.
438438
439439
:param str action: command to use in the gitlab API, see documentation there
@@ -442,6 +442,7 @@ def _github(self, action):
442442
log = LOGGER.getChild("GitHub")
443443
options = dict(self._options)
444444
options["action"] = action
445+
445446
ghURL = f"https://api.github.com/repos/{options['owner']}/{options['repo']}/{options['action']}"
446447
log.debug("Calling: %s", ghURL)
447448
return ghURL
@@ -504,7 +505,7 @@ def getGithubLatestTagDate(self, sinceTag):
504505
log = LOGGER.getChild("getGithubLatestTagDate")
505506

506507
# Get all tags
507-
tags = req2Json(url=self._github("tags"))
508+
tags = req2Json(url=self._github("tags"), queryParameters={"per_page": 100})
508509
if isinstance(tags, dict) and "Not Found" in tags.get("message"):
509510
raise RuntimeError(f"Package not found: {str(self)}")
510511

0 commit comments

Comments
 (0)