-
Couldn't load subscription status.
- Fork 36
WIP glob feature #936
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
WIP glob feature #936
Changes from 3 commits
dce2c0a
8277396
397780a
edc12c3
787c8ed
9b757d5
33f7292
a5df394
452568b
26c2572
cd9d522
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| # Creates a DataFrame with two columns ("filename_glob" and "glob_count") based on the file paths recorded in a .darshan log file. | ||
| # It uses sequence matching and grouping techniques to group similar file paths together and generates an HTML report of the grouped paths and their counts | ||
| # Command to run: python glob_feature.py -p path/to/log/file.darshan | ||
|
|
||
|
|
||
| import argparse | ||
| import pandas as pd | ||
| import difflib | ||
| import darshan | ||
| import re | ||
| import os | ||
|
|
||
|
|
||
def make_path_grouper(threshold=0.8):
    """Build a stateful callable that collapses runs of similar paths.

    The returned function is meant to be called on paths one at a time, in
    order (for example through ``Series.apply``).  It remembers a single
    "representative" path.  An incoming path whose
    ``difflib.SequenceMatcher`` ratio against the representative is at least
    ``threshold`` is replaced by the representative; otherwise the incoming
    path becomes the new representative and is returned unchanged.

    Only the most recent representative is remembered, so the result depends
    on the order in which paths are supplied: similar paths are merged only
    when they arrive consecutively.

    Parameters
    ----------
    threshold : float, optional
        Minimum similarity ratio (0.0 - 1.0) for a path to be folded into the
        current representative.  Defaults to 0.8.

    Returns
    -------
    callable
        ``group_paths(path) -> str`` returning the representative chosen for
        ``path``.
    """
    matcher = difflib.SequenceMatcher()

    def group_paths(paths):
        # SequenceMatcher starts with an empty first sequence, so a falsy
        # ``matcher.a`` means no representative has been stored yet.
        if not matcher.a:
            matcher.set_seq1(paths)
            return paths

        matcher.set_seq2(paths)
        if matcher.ratio() >= threshold:
            # Similar enough: report the stored representative instead.
            return matcher.a

        # Too different: this path starts a new group.
        matcher.set_seq1(paths)
        return paths

    return group_paths
|
|
||
|
|
||
def regex_df_condenser(df, paths):
    """Condense similar file paths into glob-like patterns with counts.

    Each path in ``df["filename_glob"]`` is assigned to a group by the
    stateful grouper returned from ``make_path_grouper()``.  For every group
    the common prefix of *all original member paths* is computed, trimmed
    back to the last path separator, and suffixed with ``.*``.

    The prefix is deliberately derived from the original member paths rather
    than from the group's representative alone; otherwise the resulting
    pattern can fail to match files that were counted under it (for example
    members that live in sibling directories).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"filename_glob"`` column of path strings.  The
        frame is not modified.
    paths : object
        Unused; retained so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``"filename_glob"`` (the pattern) and ``"glob_count"`` (the
        number of input paths in the group), sorted by ``"glob_count"`` in
        descending order.
    """

    def find_common_prefix(paths):
        # os.path.commonprefix works character-wise, so the raw result may
        # end in the middle of a path component.
        common_prefix = os.path.commonprefix(list(paths))

        # Trim the common prefix to the last path separator
        last_separator = common_prefix.rfind(os.path.sep)
        if last_separator >= 0:
            return common_prefix[:last_separator + 1]
        return common_prefix

    original_paths = df["filename_glob"]

    # Map every path to its group's representative without touching ``df``.
    representatives = original_paths.apply(make_path_grouper())

    # Keep the original paths next to their representative so the prefix can
    # be computed from the real group members.
    members = pd.DataFrame(
        {
            "representative": representatives.to_numpy(),
            "path": original_paths.to_numpy(),
        }
    )

    rows = [
        (find_common_prefix(group["path"].tolist()) + r".*", len(group))
        for _, group in members.groupby("representative", sort=True)
    ]

    condensed = pd.DataFrame(rows, columns=["filename_glob", "glob_count"])
    return condensed.sort_values(by="glob_count", ascending=False)
|
|
||
|
|
||
|
|
||
def main(log_path, output_path=None):
    """Generate an HTML report of condensed file-path patterns for a log.

    Opens ``log_path`` with ``darshan.DarshanReport``, builds a one-column
    DataFrame from ``report.name_records``, keeps only entries containing a
    ``/``, condenses them with ``regex_df_condenser`` and writes a styled
    HTML table.

    Parameters
    ----------
    log_path : str
        Path to the Darshan log file.
    output_path : str, optional
        Destination of the HTML report.  When omitted (or empty) the report
        is written to ``name_record_glob_hd5f.html`` in the current working
        directory, which is where it was always written before this argument
        was honoured.
    """
    if not output_path:
        output_path = "name_record_glob_hd5f.html"

    report = darshan.DarshanReport(log_path)

    # presumably name_records maps record ids to file paths -- TODO confirm
    df = pd.DataFrame.from_dict(report.name_records, orient="index", columns=["filename_glob"])

    # Keep only entries that contain a path separator.  ``.copy()`` detaches
    # the filtered frame so the column assignment below does not write into
    # a slice of the unfiltered frame.
    df = df[df["filename_glob"].str.contains("/", regex=False)].copy()
    df["glob_count"] = 1
    df = regex_df_condenser(df, df["filename_glob"])

    style = df.style.background_gradient(axis=0, cmap="viridis", gmap=df["glob_count"])
    style.hide(axis="index")
    style.set_table_styles([
        {"selector": "", "props": [("border", "1px solid grey")]},
        {"selector": "tbody td", "props": [("border", "1px solid grey")]},
        {"selector": "th", "props": [("border", "1px solid grey")]}
    ])
    html = style.to_html()

    with open(output_path, "w", encoding="utf-8") as html_file:
        html_file.write(html)
|
|
||
|
|
||
if __name__ == "__main__":
    # Command-line entry point: parse the log and report paths, then run main().
    parser = argparse.ArgumentParser()
    # The log path is mandatory; without it there is nothing to analyse.
    parser.add_argument('-p', '--log-path', type=str, required=True, help="Path to the log file")
    # Default matches the filename the report was historically written to.
    parser.add_argument(
        '-o', '--output-path', type=str, default="name_record_glob_hd5f.html",
        help="Path to the output HTML file (default: %(default)s)",
    )
    args = parser.parse_args()
    main(log_path=args.log_path, output_path=args.output_path)
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function seems to discard the filenames, so you may need to preserve them in another data structure, i.e.,
`df_orig = df.copy()`, and then use `difflib` to also compare the filenames later below (`difflib` really is great, but it requires quite some effort to get it to do exactly what you want). This function is also pretty confusing. I think you should perhaps write it out on paper or the whiteboard first and then add comments to each section explaining how it will get you to this (or similar) final output for
`e3sm_io_heatmap_only.darshan`:
Also, now that I look at it, perhaps it is a bit hard to see the regex metacharacter `\d` in the path, so perhaps it may make sense to underline/highlight it somehow, though that's more of an aesthetic concern for a bit later.