Skip to content
Open
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions git_project/glob_feature/glob_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the file names recorded in a .darshan log file.
# It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts.
# Command to run: python glob_feature.py -p path/to/log/file.darshan


import argparse
import pandas as pd
import difflib
import darshan
import re
import os


def make_path_grouper(threshold=0.8):
    """Build a stateful callable that maps similar consecutive paths to one key.

    The returned function remembers a "representative" path in a shared
    ``difflib.SequenceMatcher``. Each call compares the incoming path with
    that representative:

    * if their similarity ratio is at least ``threshold``, the representative
      is returned, so the incoming path joins the representative's group;
    * otherwise the incoming path becomes the new representative and is
      returned unchanged.

    Because the state is shared between calls, the result depends on the
    order in which paths are fed in.

    Args:
        threshold: Minimum ``SequenceMatcher.ratio()`` (0.0 to 1.0) for two
            paths to be treated as members of the same group. Defaults to
            0.8, the cutoff that was previously hard-coded.

    Returns:
        A function taking one path string and returning its group key.
    """
    matcher = difflib.SequenceMatcher()

    def group_paths(paths):
        """Return the group key for ``paths`` (a single path string)."""
        if not matcher.a:
            # No representative stored yet (SequenceMatcher starts with an
            # empty first sequence): adopt this path as the representative.
            matcher.set_seq1(paths)
            return paths

        matcher.set_seq2(paths)
        if matcher.ratio() >= threshold:
            return matcher.a

        # Too different from the current representative: start a new group.
        matcher.set_seq1(paths)
        return paths

    return group_paths


def find_common_prefix(paths):
    """Return the directory-level common prefix of ``paths``.

    The character-wise common prefix is computed with
    ``os.path.commonprefix`` and then cut back to (and including) the last
    path separator, so the result never ends in the middle of a file or
    directory name. If the common prefix contains no separator it is
    returned as is.

    Args:
        paths: Iterable of path strings.

    Returns:
        The trimmed common prefix (an empty string for an empty iterable).
    """
    common_prefix = os.path.commonprefix(list(paths))
    last_separator = common_prefix.rfind(os.path.sep)
    if last_separator >= 0:
        return common_prefix[:last_separator + 1]
    return common_prefix


def regex_df_condenser(df, paths):
    """Condense a frame of file paths into regex-like patterns with counts.

    Every value of ``df["filename_glob"]`` is assigned to a group by the
    callable from ``make_path_grouper()``. The original paths of each group
    are kept, so the emitted pattern is the directory-level common prefix of
    *all* members of the group followed by ``.*``; the pattern therefore
    matches every file that is counted for it.

    The input frame is not modified.

    Args:
        df: DataFrame with a ``"filename_glob"`` column of path strings.
        paths: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame with the columns ``"filename_glob"`` (pattern) and
        ``"glob_count"`` (number of paths in the group), sorted by
        ``"glob_count"`` in descending order.
    """
    path_grouper_func = make_path_grouper()

    # Group key -> original paths in that group. Keeping the originals (not
    # just the key) is what lets the prefix be derived from real members.
    members = {}
    for original_path in df["filename_glob"]:
        group_key = path_grouper_func(original_path)
        members.setdefault(group_key, []).append(original_path)

    # Iterate keys in sorted order, the same order a pandas groupby yields.
    ordered_keys = sorted(members)
    condensed = pd.DataFrame(
        {
            "filename_glob": [
                find_common_prefix(members[key]) + r".*" for key in ordered_keys
            ],
            "glob_count": [len(members[key]) for key in ordered_keys],
        }
    )

    return condensed.sort_values(by="glob_count", ascending=False)



def main(log_path, output_path=None):
    """Write an HTML table of condensed file-path patterns from a darshan log.

    The name records of the log are loaded into a DataFrame, reduced to
    entries containing a ``/``, condensed with ``regex_df_condenser`` and
    rendered as a styled HTML table whose background gradient follows
    ``glob_count``.

    Args:
        log_path: Path to the ``.darshan`` log file.
        output_path: Path of the HTML file to write. When it is ``None`` or
            empty, ``name_record_glob_hd5f.html`` in the current working
            directory is used.
    """
    report = darshan.DarshanReport(log_path)

    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])

    # Keep only entries that contain a path separator. The explicit copy
    # makes the column assignment below act on an independent frame rather
    # than on a slice of the unfiltered one.
    df = df[df["filename_glob"].str.contains("/", regex=False)].copy()
    df["glob_count"] = 1
    df = regex_df_condenser(df, df["filename_glob"])

    style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"])
    style.hide(axis="index")
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]},
    ])
    html = style.to_html()

    # Honour the requested output location; fall back to the historical
    # file name when none was given.
    if not output_path:
        output_path = "name_record_glob_hd5f.html"
    with open(output_path, "w", encoding="utf-8") as html_file:
        html_file.write(html)


if __name__ == "__main__":
    # Command-line entry point: parse the log and output locations and run
    # the report generation.
    parser = argparse.ArgumentParser()
    # The log path is mandatory: without it there is nothing to read, so let
    # argparse report the missing option instead of passing None to main().
    parser.add_argument('-p', '--log-path', type=str, required=True, help="Path to the log file")
    parser.add_argument('-o', '--output-path', type=str, help="Path to the output HTML file")
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)