Skip to content

Commit 612faf8

Browse files
committed
Fixes in counters and HDF5 detection
1 parent 20421d1 commit 612faf8

File tree

1 file changed

+77
-19
lines changed

1 file changed

+77
-19
lines changed

drishti/main.py

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -49,14 +49,17 @@
4949

5050
THRESHOLD_OPERATION_IMBALANCE = 0.1
5151
THRESHOLD_SMALL_REQUESTS = 0.1
52+
THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000
5253
THRESHOLD_MISALIGNED_REQUESTS = 0.1
5354
THRESHOLD_METADATA = 0.1
5455
THRESHOLD_METADATA_TIME_RANK = 30 # seconds
5556
THRESHOLD_RANDOM_OPERATIONS = 0.2
57+
THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000
5658
THRESHOLD_STRAGGLERS = 0.15
5759
THRESHOLD_IMBALANCE = 0.30
5860
THRESHOLD_INTERFACE_STDIO = 0.1
5961
THRESHOLD_COLLECTIVE_OPERATIONS = 0.5
62+
THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000
6063

6164
INSIGHTS_STDIO_HIGH_USAGE = 'S01'
6265
INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01'
@@ -487,18 +490,35 @@ def main():
487490
#########################################################################################################################################################################
488491

489492
# Get the number of small I/O operations (less than 1 MB)
490-
total_reads_small = df['counters']['POSIX_SIZE_READ_0_100'].sum() + df['counters']['POSIX_SIZE_READ_1K_10K'].sum() + df['counters']['POSIX_SIZE_READ_100K_1M'].sum()
493+
total_reads_small = (
494+
df['counters']['POSIX_SIZE_READ_0_100'].sum() +
495+
df['counters']['POSIX_SIZE_READ_100_1K'].sum() +
496+
df['counters']['POSIX_SIZE_READ_1K_10K'].sum() +
497+
df['counters']['POSIX_SIZE_READ_10K_100K'].sum() +
498+
df['counters']['POSIX_SIZE_READ_100K_1M'].sum()
499+
)
491500

492501
# Get the files responsible for more than half of these accesses
493502
files = []
494503

495-
df['counters']['INSIGHTS_POSIX_SMALL'] = df['counters']['POSIX_SIZE_READ_0_100'] + df['counters']['POSIX_SIZE_READ_1K_10K'] + df['counters']['POSIX_SIZE_READ_100K_1M']
504+
df['counters']['INSIGHTS_POSIX_SMALL'] = (
505+
df['counters']['POSIX_SIZE_READ_0_100'] +
506+
df['counters']['POSIX_SIZE_READ_100_1K'] +
507+
df['counters']['POSIX_SIZE_READ_1K_10K'] +
508+
df['counters']['POSIX_SIZE_READ_10K_100K'] +
509+
df['counters']['POSIX_SIZE_WRITE_100K_1M'] +
510+
df['counters']['POSIX_SIZE_WRITE_0_100'] +
511+
df['counters']['POSIX_SIZE_WRITE_100_1K'] +
512+
df['counters']['POSIX_SIZE_WRITE_1K_10K'] +
513+
df['counters']['POSIX_SIZE_WRITE_10K_100K'] +
514+
df['counters']['POSIX_SIZE_WRITE_100K_1M']
515+
)
496516

497517
detected_files = pd.DataFrame(df['counters'].groupby('id')['INSIGHTS_POSIX_SMALL'].sum()).reset_index()
498518
detected_files.columns = ['id', 'total']
499519
detected_files.loc[:, 'id'] = detected_files.loc[:, 'id'].astype(str)
500520

501-
if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS:
521+
if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE:
502522
issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read/write requests'.format(
503523
total_reads_small, total_reads_small / total_reads * 100.0
504524
)
@@ -543,9 +563,15 @@ def main():
543563
)
544564

545565
# Get the number of small I/O operations (less than 1 MB)
546-
total_writes_small = df['counters']['POSIX_SIZE_WRITE_0_100'].sum() + df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() + df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum()
566+
total_writes_small = (
567+
df['counters']['POSIX_SIZE_WRITE_0_100'].sum() +
568+
df['counters']['POSIX_SIZE_WRITE_100_1K'].sum() +
569+
df['counters']['POSIX_SIZE_WRITE_1K_10K'].sum() +
570+
df['counters']['POSIX_SIZE_WRITE_10K_100K'].sum() +
571+
df['counters']['POSIX_SIZE_WRITE_100K_1M'].sum()
572+
)
547573

548-
if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS:
574+
if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE:
549575
issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all read/write requests'.format(
550576
total_writes_small, total_writes_small / total_writes * 100.0
551577
)
@@ -676,7 +702,7 @@ def main():
676702
#print('READ Random: {} ({:.2f}%)'.format(read_random, read_random / total_reads * 100))
677703

678704
if total_reads:
679-
if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS:
705+
if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE:
680706
issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)'.format(
681707
read_random, read_random / total_reads * 100.0
682708
)
@@ -711,7 +737,7 @@ def main():
711737
#print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100))
712738

713739
if total_writes:
714-
if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS:
740+
if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE:
715741
issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)'.format(
716742
write_random, write_random / total_writes * 100.0
717743
)
@@ -746,11 +772,23 @@ def main():
746772

747773
if not shared_files.empty:
748774
total_shared_reads = shared_files['POSIX_READS'].sum()
749-
total_shared_reads_small = shared_files['POSIX_SIZE_READ_0_100'].sum() + shared_files['POSIX_SIZE_READ_1K_10K'].sum() + shared_files['POSIX_SIZE_READ_100K_1M'].sum()
775+
total_shared_reads_small = (
776+
shared_files['POSIX_SIZE_READ_0_100'].sum() +
777+
shared_files['POSIX_SIZE_READ_100_1K'].sum() +
778+
shared_files['POSIX_SIZE_READ_1K_10K'].sum() +
779+
shared_files['POSIX_SIZE_READ_10K_100K'].sum() +
780+
shared_files['POSIX_SIZE_READ_100K_1M'].sum()
781+
)
750782

751-
shared_files['INSIGHTS_POSIX_SMALL_READS'] = shared_files['POSIX_SIZE_READ_0_100'] + shared_files['POSIX_SIZE_READ_1K_10K'] + shared_files['POSIX_SIZE_READ_100K_1M']
783+
shared_files['INSIGHTS_POSIX_SMALL_READS'] = (
784+
shared_files['POSIX_SIZE_READ_0_100'] +
785+
shared_files['POSIX_SIZE_READ_100_1K'] +
786+
shared_files['POSIX_SIZE_READ_1K_10K'] +
787+
shared_files['POSIX_SIZE_READ_10K_100K'] +
788+
shared_files['POSIX_SIZE_READ_100K_1M']
789+
)
752790

753-
if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS:
791+
if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE:
754792
issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests'.format(
755793
total_shared_reads_small, total_shared_reads_small / total_shared_reads * 100.0
756794
)
@@ -781,11 +819,23 @@ def main():
781819
)
782820

783821
total_shared_writes = shared_files['POSIX_WRITES'].sum()
784-
total_shared_writes_small = shared_files['POSIX_SIZE_WRITE_0_100'].sum() + shared_files['POSIX_SIZE_WRITE_1K_10K'].sum() + shared_files['POSIX_SIZE_WRITE_100K_1M'].sum()
822+
total_shared_writes_small = (
823+
shared_files['POSIX_SIZE_WRITE_0_100'].sum() +
824+
shared_files['POSIX_SIZE_WRITE_100_1K'].sum() +
825+
shared_files['POSIX_SIZE_WRITE_1K_10K'].sum() +
826+
shared_files['POSIX_SIZE_WRITE_10K_100K'].sum() +
827+
shared_files['POSIX_SIZE_WRITE_100K_1M'].sum()
828+
)
785829

786-
shared_files['INSIGHTS_POSIX_SMALL_WRITES'] = shared_files['POSIX_SIZE_WRITE_0_100'] + shared_files['POSIX_SIZE_WRITE_1K_10K'] + shared_files['POSIX_SIZE_WRITE_100K_1M']
830+
shared_files['INSIGHTS_POSIX_SMALL_WRITES'] = (
831+
shared_files['POSIX_SIZE_WRITE_0_100'] +
832+
shared_files['POSIX_SIZE_WRITE_100_1K'] +
833+
shared_files['POSIX_SIZE_WRITE_1K_10K'] +
834+
shared_files['POSIX_SIZE_WRITE_10K_100K'] +
835+
shared_files['POSIX_SIZE_WRITE_100K_1M']
836+
)
787837

788-
if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS:
838+
if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE:
789839
issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests'.format(
790840
total_shared_writes_small, total_shared_writes_small / total_shared_writes * 100.0
791841
)
@@ -1089,7 +1139,7 @@ def main():
10891139
total_mpiio_read_operations = df_mpiio['counters']['MPIIO_INDEP_READS'].sum() + df_mpiio['counters']['MPIIO_COLL_READS'].sum()
10901140

10911141
if df_mpiio['counters']['MPIIO_COLL_READS'].sum() == 0:
1092-
if total_mpiio_read_operations:
1142+
if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE:
10931143
issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls'.format(
10941144
df_mpiio['counters']['MPIIO_INDEP_READS'].sum(),
10951145
df_mpiio['counters']['MPIIO_INDEP_READS'].sum() / (total_mpiio_read_operations) * 100
@@ -1100,7 +1150,7 @@ def main():
11001150
files = pd.DataFrame(df_mpiio_collective_reads.groupby('id').sum()).reset_index()
11011151

11021152
for index, row in df_mpiio_collective_reads.iterrows():
1103-
if (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS:
1153+
if (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_READS'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE:
11041154
detail.append(
11051155
{
11061156
'message': '{} ({}%) of independent reads to "{}"'.format(
@@ -1136,7 +1186,7 @@ def main():
11361186
total_mpiio_write_operations = df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() + df_mpiio['counters']['MPIIO_COLL_WRITES'].sum()
11371187

11381188
if df_mpiio['counters']['MPIIO_COLL_WRITES'].sum() == 0:
1139-
if total_mpiio_write_operations:
1189+
if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE:
11401190
issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls'.format(
11411191
df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum(),
11421192
df_mpiio['counters']['MPIIO_INDEP_WRITES'].sum() / (total_mpiio_write_operations) * 100
@@ -1147,7 +1197,7 @@ def main():
11471197
files = pd.DataFrame(df_mpiio_collective_writes.groupby('id').sum()).reset_index()
11481198

11491199
for index, row in df_mpiio_collective_writes.iterrows():
1150-
if (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS:
1200+
if (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) and row['MPIIO_INDEP_WRITES'] / (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS and (row['MPIIO_INDEP_READS'] + row['MPIIO_INDEP_WRITES']) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE:
11511201
detail.append(
11521202
{
11531203
'message': '{} ({}%) independent writes to "{}"'.format(
@@ -1182,12 +1232,20 @@ def main():
11821232

11831233
# Look for usage of non-block operations
11841234

1235+
# Look for HDF5 file extension
1236+
1237+
has_hdf5_extension = False
1238+
1239+
for index, row in df_mpiio['counters'].iterrows():
1240+
if file_map[int(row['id'])].endswith('.h5') or file_map[int(row['id'])].endswith('.hdf5'):
1241+
has_hdf5_extension = True
1242+
11851243
if df_mpiio['counters']['MPIIO_NB_READS'].sum() == 0:
11861244
issue = 'Application could benefit from non-blocking (asynchronous) reads'
11871245

11881246
recommendation = []
11891247

1190-
if 'H5F' in modules:
1248+
if 'H5F' in modules or has_hdf5_extension:
11911249
recommendation.append(
11921250
{
11931251
'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)',
@@ -1212,7 +1270,7 @@ def main():
12121270

12131271
recommendation = []
12141272

1215-
if 'H5F' in modules:
1273+
if 'H5F' in modules or has_hdf5_extension:
12161274
recommendation.append(
12171275
{
12181276
'message': 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)',

0 commit comments

Comments
 (0)