From 10ed58a1e8d1157474fda23e9f40d154d490599d Mon Sep 17 00:00:00 2001
From: Tyler Reddy
Date: Tue, 14 Feb 2023 19:28:53 -0700
Subject: [PATCH] TST: add Python-level filter test

* add a regression test comparing the manual Perl-based single-file
  filtering used prior to the accumulator API vs. the current Python
  "API" equivalent

* we'll probably want documentation demonstrating this as well, but the
  regression test is probably the higher priority in the short term

* note also that we currently place the burden of expanding the
  DataFrames to include the proper file paths/names on the end user
  (the DataFrames only have columns with the `id` hash) if they want to
  filter by regex; that said, we do have library functions that do this
  kind of thing internally anyway... all of this is to say that just
  writing this test will probably expose some design thinking, or at
  least reveal some of the more complex things we may want in our docs
  for the Python filtering route

* finally, note that Phil C. did forward a C program and some other
  assets for developing a test like this; however, it is far easier for
  me to just perform the steps manually on log files that are already in
  the logs repo (that was kind of what I was hoping for--expected values
  for counters/derived metrics given a log file and a single file hash,
  but I've just gone ahead and pulled them out myself)
---
 .../pydarshan/darshan/tests/test_lib_accum.py | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/darshan-util/pydarshan/darshan/tests/test_lib_accum.py b/darshan-util/pydarshan/darshan/tests/test_lib_accum.py
index be3a55b56..203c63e8c 100644
--- a/darshan-util/pydarshan/darshan/tests/test_lib_accum.py
+++ b/darshan-util/pydarshan/darshan/tests/test_lib_accum.py
@@ -4,6 +4,7 @@
 from darshan.log_utils import get_log_path
 
 import pytest
+from numpy.testing import assert_allclose
 
 
 @pytest.mark.parametrize("log_path, mod_name, expected_str", [
@@ -92,3 +93,72 @@ def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
     actual_str = log_get_bytes_bandwidth(derived_metrics=derived_metrics,
                                          mod_name=mod_name)
     assert actual_str == expected_str
+
+
+
+@pytest.mark.parametrize("log_name, mod_name, file_hash_or_name, expected_slowest_rank, expected_slowest_rank_io_time, expected_total_bytes", [
+    # follow Phil C.'s instructions to manually reproduce the
+    # expected values from the old/Perl approach to selecting
+    # a single file
+
+    # darshan-parser e3sm_io_heatmap_only.darshan | grep -E POSIX_OPENS
+
+    # select just one of the file hashes to filter on:
+    # ...
+    # POSIX 454 6966057185861764086 POSIX_OPENS 1 /projects/radix-io/snyder/e3sm/can_I_out_h0.nc / overlay
+    # POSIX 454 12386309978061520508 POSIX_OPENS 1 /projects/radix-io/snyder/e3sm/can_I_out_h1.nc / overlay
+
+    # darshan-convert --file 6966057185861764086 e3sm_io_heatmap_only.darshan e3sm_single_file.darshan
+
+    # get expected values
+    # darshan-parser --total e3sm_single_file.darshan
+    # darshan-parser --perf e3sm_single_file.darshan
+    # darshan-parser --file e3sm_single_file.darshan
+    ("e3sm_io_heatmap_only.darshan",
+     "POSIX",
+     6966057185861764086,
+     4,
+     264.642345,
+     303145177708),
+    # similarly for MPI-IO
+    ("e3sm_io_heatmap_only.darshan",
+     "MPI-IO",
+     6966057185861764086,
+     0,
+     0.0,
+     77056742332),
+    ])
+def test_pre_filtering_by_file_hash(log_name,
+                                    mod_name,
+                                    file_hash_or_name,
+                                    expected_slowest_rank,
+                                    expected_slowest_rank_io_time,
+                                    expected_total_bytes):
+    # ensure that we match the old Perl tools for aggregating
+    # on a specific file hash
+
+    # TODO: perhaps expand to include filename regex filtering; however,
+    # at the time of writing, rec_dict only contains the *hashes* of the filepaths
+    # so we'd need to do a bunch more work to first expand our DataFrames to perform
+    # this type of filtering
+    log_path = get_log_path(log_name)
+    with darshan.DarshanReport(log_path, read_all=True) as report:
+        report.mod_read_all_records(mod_name, dtype="pandas")
+        rec_dict = report.records[mod_name][0]
+        # manual filtering of df before using C accumulator API
+        counters = rec_dict["counters"]
+        fcounters = rec_dict["fcounters"]
+        counters = counters[counters.id == file_hash_or_name]
+        fcounters = fcounters[fcounters.id == file_hash_or_name]
+        rec_dict["counters"] = counters
+        rec_dict["fcounters"] = fcounters
+        nprocs = report.metadata['job']['nprocs']
+        # now use the C accumulator API
+        derived_metrics = log_get_derived_metrics(rec_dict, mod_name, nprocs)
+        actual_slowest_rank = derived_metrics.unique_io_slowest_rank
+        actual_slowest_rank_io_time = derived_metrics.unique_io_total_time_by_slowest
+        actual_total_bytes = derived_metrics.total_bytes
+
+        assert actual_slowest_rank == expected_slowest_rank
+        assert_allclose(actual_slowest_rank_io_time, expected_slowest_rank_io_time)
+        assert actual_total_bytes == expected_total_bytes
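
For the docs follow-up mentioned in the commit message, here is a minimal
sketch of what the filename-regex variant flagged in the TODO might look like
in end-user code. This is a sketch under assumptions, not part of the patch:
it assumes `report.name_records` provides the id-hash-to-path mapping, reuses
the `log_get_derived_metrics(rec_dict, mod_name, nprocs)` call exercised by
the test above, and guesses `darshan.lib.accum` as the import location for
the accumulator helper since the diff context does not show it:

    # sketch: filter POSIX records by filename regex rather than by a
    # raw id hash, then hand the reduced DataFrames to the accumulator;
    # the import path for log_get_derived_metrics is an assumption here
    import re

    import darshan
    from darshan.lib.accum import log_get_derived_metrics
    from darshan.log_utils import get_log_path

    log_path = get_log_path("e3sm_io_heatmap_only.darshan")
    with darshan.DarshanReport(log_path, read_all=True) as report:
        report.mod_read_all_records("POSIX", dtype="pandas")
        rec_dict = report.records["POSIX"][0]
        # name_records maps each record id (hash) back to a file path,
        # which lets us regex-match paths and collect the matching ids
        pattern = re.compile(r"can_I_out_h0\.nc$")
        keep_ids = {rec_id for rec_id, path in report.name_records.items()
                    if pattern.search(str(path))}
        for key in ("counters", "fcounters"):
            df = rec_dict[key]
            rec_dict[key] = df[df["id"].isin(keep_ids)]
        nprocs = report.metadata["job"]["nprocs"]
        derived_metrics = log_get_derived_metrics(rec_dict, "POSIX", nprocs)
        # should match the hash-based filtering path for the same file
        print(derived_metrics.total_bytes)

Once the ids are resolved to paths this way, the hash-based filtering in the
test applies unchanged; this is essentially the DataFrame expansion work that
the third commit-message bullet currently places on the end user.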