diff --git a/darshan-util/pydarshan/darshan/report.py b/darshan-util/pydarshan/darshan/report.py
index afdc4c683..0c307d5c7 100644
--- a/darshan-util/pydarshan/darshan/report.py
+++ b/darshan-util/pydarshan/darshan/report.py
@@ -306,7 +306,8 @@ def __init__(self,
             filename=None, dtype='numpy',
             start_time=None, end_time=None,
             automatic_summary=False,
-            read_all=True, lookup_name_records=True):
+            read_all=True, lookup_name_records=True,
+            strict=False):
         """
         Args:
             filename (str): filename to open (optional)
@@ -314,12 +315,15 @@ def __init__(self,
             automatic_summary (bool): automatically generate summary after loading
             read_all (bool): whether to read all records for log
             lookup_name_records (bool): lookup and update name_records as records are loaded
+            strict (bool): raise ValueError if invalid log file content is detected
+                (currently: any time-related counter with a value below -1)
 
         Return:
             None
 
         """
         self.filename = filename
+        self.strict = strict
         self.log = None
 
         # Behavioral Options
@@ -373,6 +377,28 @@ def __init__(self,
 
         if filename:
             self.open(filename, read_all=read_all)
+        if self.strict:
+            # TODO: a more thorough checking for bad log data
+            for mod in self.records.values():
+                mod_df_dict = mod.to_df()
+                mod_counters_df = mod_df_dict.get("counters")
+                mod_fcounters_df = mod_df_dict.get("fcounters")
+                for df in [mod_counters_df, mod_fcounters_df]:
+                    if df is None:
+                        # .get() yields None when a module does not provide
+                        # this table; there is nothing to validate in that case
+                        continue
+                    # all "TIMER" columns should have values >= -1
+                    # NOTE: darshan sometimes uses -1 for missing values
+                    # which we may want to eventually avoid, but for now
+                    # we will not raise an error unless below -1 for a time
+                    # value that should be positive
+                    for column_name in df.columns:
+                        if "time" in column_name.lower():
+                            if df[column_name].min() < -1:
+                                # note: we may want to use a custom error type
+                                # here for "invalid logs"
+                                raise ValueError(f"Invalid log file; negative value in {column_name}")
 
     @property
     def metadata(self):
diff --git a/darshan-util/pydarshan/darshan/tests/test_report.py b/darshan-util/pydarshan/darshan/tests/test_report.py
index 50b2d96f5..0dfe0ff9f 100644
--- a/darshan-util/pydarshan/darshan/tests/test_report.py
+++ b/darshan-util/pydarshan/darshan/tests/test_report.py
@@ -497,3 +497,14 @@ def test_heatmap_df_invalid_operation():
     report = darshan.DarshanReport(log_path)
     with pytest.raises(ValueError, match="invalid_op not in heatmap"):
         report.heatmaps["POSIX"].to_df(ops=["invalid_op"])
+
+
+@pytest.mark.parametrize("log_name, error_match", [
+    # see: gh-562
+    ("sample.darshan", "STDIO_F_WRITE_TIME")
+])
+def test_detect_known_invalid_logs(log_name, error_match):
+    """Opening a known-invalid log with strict=True must raise ValueError."""
+    log_path = get_log_path(log_name)
+    with pytest.raises(ValueError, match=f"Invalid log file.*{error_match}"):
+        darshan.DarshanReport(log_path, strict=True)