4949
5050THRESHOLD_OPERATION_IMBALANCE = 0.1
5151THRESHOLD_SMALL_REQUESTS = 0.1
52+ THRESHOLD_SMALL_REQUESTS_ABSOLUTE = 1000
5253THRESHOLD_MISALIGNED_REQUESTS = 0.1
5354THRESHOLD_METADATA = 0.1
5455THRESHOLD_METADATA_TIME_RANK = 30 # seconds
5556THRESHOLD_RANDOM_OPERATIONS = 0.2
57+ THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE = 1000
5658THRESHOLD_STRAGGLERS = 0.15
5759THRESHOLD_IMBALANCE = 0.30
5860THRESHOLD_INTERFACE_STDIO = 0.1
5961THRESHOLD_COLLECTIVE_OPERATIONS = 0.5
62+ THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE = 1000
6063
6164INSIGHTS_STDIO_HIGH_USAGE = 'S01'
6265INSIGHTS_POSIX_WRITE_COUNT_INTENSIVE = 'P01'
@@ -487,18 +490,35 @@ def main():
487490 #########################################################################################################################################################################
488491
489492 # Get the number of small I/O operations (less than 1 MB)
490- total_reads_small = df ['counters' ]['POSIX_SIZE_READ_0_100' ].sum () + df ['counters' ]['POSIX_SIZE_READ_1K_10K' ].sum () + df ['counters' ]['POSIX_SIZE_READ_100K_1M' ].sum ()
493+ total_reads_small = (
494+ df ['counters' ]['POSIX_SIZE_READ_0_100' ].sum () +
495+ df ['counters' ]['POSIX_SIZE_READ_100_1K' ].sum () +
496+ df ['counters' ]['POSIX_SIZE_READ_1K_10K' ].sum () +
497+ df ['counters' ]['POSIX_SIZE_READ_10K_100K' ].sum () +
498+ df ['counters' ]['POSIX_SIZE_READ_100K_1M' ].sum ()
499+ )
491500
492501 # Get the files responsible for more than half of these accesses
493502 files = []
494503
495- df ['counters' ]['INSIGHTS_POSIX_SMALL' ] = df ['counters' ]['POSIX_SIZE_READ_0_100' ] + df ['counters' ]['POSIX_SIZE_READ_1K_10K' ] + df ['counters' ]['POSIX_SIZE_READ_100K_1M' ]
504+ df ['counters' ]['INSIGHTS_POSIX_SMALL' ] = (
505+ df ['counters' ]['POSIX_SIZE_READ_0_100' ] +
506+ df ['counters' ]['POSIX_SIZE_READ_100_1K' ] +
507+ df ['counters' ]['POSIX_SIZE_READ_1K_10K' ] +
508+ df ['counters' ]['POSIX_SIZE_READ_10K_100K' ] +
509+ df ['counters' ]['POSIX_SIZE_READ_100K_1M' ] +
510+ df ['counters' ]['POSIX_SIZE_WRITE_0_100' ] +
511+ df ['counters' ]['POSIX_SIZE_WRITE_100_1K' ] +
512+ df ['counters' ]['POSIX_SIZE_WRITE_1K_10K' ] +
513+ df ['counters' ]['POSIX_SIZE_WRITE_10K_100K' ] +
514+ df ['counters' ]['POSIX_SIZE_WRITE_100K_1M' ]
515+ )
496516
497517 detected_files = pd .DataFrame (df ['counters' ].groupby ('id' )['INSIGHTS_POSIX_SMALL' ].sum ()).reset_index ()
498518 detected_files .columns = ['id' , 'total' ]
499519 detected_files .loc [:, 'id' ] = detected_files .loc [:, 'id' ].astype (str )
500520
501- if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS :
521+ if total_reads_small and total_reads_small / total_reads > THRESHOLD_SMALL_REQUESTS and total_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE :
502522 issue = 'Application issues a high number ({}) of small read requests (i.e., < 1MB) which represents {:.2f}% of all read/write requests' .format (
503523 total_reads_small , total_reads_small / total_reads * 100.0
504524 )
@@ -543,9 +563,15 @@ def main():
543563 )
544564
545565 # Get the number of small I/O operations (less than 1 MB)
546- total_writes_small = df ['counters' ]['POSIX_SIZE_WRITE_0_100' ].sum () + df ['counters' ]['POSIX_SIZE_WRITE_1K_10K' ].sum () + df ['counters' ]['POSIX_SIZE_WRITE_100K_1M' ].sum ()
566+ total_writes_small = (
567+ df ['counters' ]['POSIX_SIZE_WRITE_0_100' ].sum () +
568+ df ['counters' ]['POSIX_SIZE_WRITE_100_1K' ].sum () +
569+ df ['counters' ]['POSIX_SIZE_WRITE_1K_10K' ].sum () +
570+ df ['counters' ]['POSIX_SIZE_WRITE_10K_100K' ].sum () +
571+ df ['counters' ]['POSIX_SIZE_WRITE_100K_1M' ].sum ()
572+ )
547573
548- if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS :
574+ if total_writes_small and total_writes_small / total_writes > THRESHOLD_SMALL_REQUESTS and total_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE :
549575 issue = 'Application issues a high number ({}) of small write requests (i.e., < 1MB) which represents {:.2f}% of all read/write requests' .format (
550576 total_writes_small , total_writes_small / total_writes * 100.0
551577 )
@@ -676,7 +702,7 @@ def main():
676702 #print('READ Random: {} ({:.2f}%)'.format(read_random, read_random / total_reads * 100))
677703
678704 if total_reads :
679- if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS :
705+ if read_random and read_random / total_reads > THRESHOLD_RANDOM_OPERATIONS and read_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE :
680706 issue = 'Application is issuing a high number ({}) of random read operations ({:.2f}%)' .format (
681707 read_random , read_random / total_reads * 100.0
682708 )
@@ -711,7 +737,7 @@ def main():
711737 #print('WRITE Random: {} ({:.2f}%)'.format(write_random, write_random / total_writes * 100))
712738
713739 if total_writes :
714- if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS :
740+ if write_random and write_random / total_writes > THRESHOLD_RANDOM_OPERATIONS and write_random > THRESHOLD_RANDOM_OPERATIONS_ABSOLUTE :
715741 issue = 'Application is issuing a high number ({}) of random write operations ({:.2f}%)' .format (
716742 write_random , write_random / total_writes * 100.0
717743 )
@@ -746,11 +772,23 @@ def main():
746772
747773 if not shared_files .empty :
748774 total_shared_reads = shared_files ['POSIX_READS' ].sum ()
749- total_shared_reads_small = shared_files ['POSIX_SIZE_READ_0_100' ].sum () + shared_files ['POSIX_SIZE_READ_1K_10K' ].sum () + shared_files ['POSIX_SIZE_READ_100K_1M' ].sum ()
775+ total_shared_reads_small = (
776+ shared_files ['POSIX_SIZE_READ_0_100' ].sum () +
777+ shared_files ['POSIX_SIZE_READ_100_1K' ].sum () +
778+ shared_files ['POSIX_SIZE_READ_1K_10K' ].sum () +
779+ shared_files ['POSIX_SIZE_READ_10K_100K' ].sum () +
780+ shared_files ['POSIX_SIZE_READ_100K_1M' ].sum ()
781+ )
750782
751- shared_files ['INSIGHTS_POSIX_SMALL_READS' ] = shared_files ['POSIX_SIZE_READ_0_100' ] + shared_files ['POSIX_SIZE_READ_1K_10K' ] + shared_files ['POSIX_SIZE_READ_100K_1M' ]
783+ shared_files ['INSIGHTS_POSIX_SMALL_READS' ] = (
784+ shared_files ['POSIX_SIZE_READ_0_100' ] +
785+ shared_files ['POSIX_SIZE_READ_100_1K' ] +
786+ shared_files ['POSIX_SIZE_READ_1K_10K' ] +
787+ shared_files ['POSIX_SIZE_READ_10K_100K' ] +
788+ shared_files ['POSIX_SIZE_READ_100K_1M' ]
789+ )
752790
753- if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS :
791+ if total_shared_reads and total_shared_reads_small / total_shared_reads > THRESHOLD_SMALL_REQUESTS and total_shared_reads_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE :
754792 issue = 'Application issues a high number ({}) of small read requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file read requests' .format (
755793 total_shared_reads_small , total_shared_reads_small / total_shared_reads * 100.0
756794 )
@@ -781,11 +819,23 @@ def main():
781819 )
782820
783821 total_shared_writes = shared_files ['POSIX_WRITES' ].sum ()
784- total_shared_writes_small = shared_files ['POSIX_SIZE_WRITE_0_100' ].sum () + shared_files ['POSIX_SIZE_WRITE_1K_10K' ].sum () + shared_files ['POSIX_SIZE_WRITE_100K_1M' ].sum ()
822+ total_shared_writes_small = (
823+ shared_files ['POSIX_SIZE_WRITE_0_100' ].sum () +
824+ shared_files ['POSIX_SIZE_WRITE_100_1K' ].sum () +
825+ shared_files ['POSIX_SIZE_WRITE_1K_10K' ].sum () +
826+ shared_files ['POSIX_SIZE_WRITE_10K_100K' ].sum () +
827+ shared_files ['POSIX_SIZE_WRITE_100K_1M' ].sum ()
828+ )
785829
786- shared_files ['INSIGHTS_POSIX_SMALL_WRITES' ] = shared_files ['POSIX_SIZE_WRITE_0_100' ] + shared_files ['POSIX_SIZE_WRITE_1K_10K' ] + shared_files ['POSIX_SIZE_WRITE_100K_1M' ]
830+ shared_files ['INSIGHTS_POSIX_SMALL_WRITES' ] = (
831+ shared_files ['POSIX_SIZE_WRITE_0_100' ] +
832+ shared_files ['POSIX_SIZE_WRITE_100_1K' ] +
833+ shared_files ['POSIX_SIZE_WRITE_1K_10K' ] +
834+ shared_files ['POSIX_SIZE_WRITE_10K_100K' ] +
835+ shared_files ['POSIX_SIZE_WRITE_100K_1M' ]
836+ )
787837
788- if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS :
838+ if total_shared_writes and total_shared_writes_small / total_shared_writes > THRESHOLD_SMALL_REQUESTS and total_shared_writes_small > THRESHOLD_SMALL_REQUESTS_ABSOLUTE :
789839 issue = 'Application issues a high number ({}) of small write requests to a shared file (i.e., < 1MB) which represents {:.2f}% of all shared file write requests' .format (
790840 total_shared_writes_small , total_shared_writes_small / total_shared_writes * 100.0
791841 )
@@ -1089,7 +1139,7 @@ def main():
10891139 total_mpiio_read_operations = df_mpiio ['counters' ]['MPIIO_INDEP_READS' ].sum () + df_mpiio ['counters' ]['MPIIO_COLL_READS' ].sum ()
10901140
10911141 if df_mpiio ['counters' ]['MPIIO_COLL_READS' ].sum () == 0 :
1092- if total_mpiio_read_operations :
1142+ if total_mpiio_read_operations and total_mpiio_read_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE :
10931143 issue = 'Application uses MPI-IO but it does not use collective read operations, instead it issues {} ({:.2f}%) independent read calls' .format (
10941144 df_mpiio ['counters' ]['MPIIO_INDEP_READS' ].sum (),
10951145 df_mpiio ['counters' ]['MPIIO_INDEP_READS' ].sum () / (total_mpiio_read_operations ) * 100
@@ -1100,7 +1150,7 @@ def main():
11001150 files = pd .DataFrame (df_mpiio_collective_reads .groupby ('id' ).sum ()).reset_index ()
11011151
11021152 for index , row in df_mpiio_collective_reads .iterrows ():
1103- if (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) and row ['MPIIO_INDEP_READS' ] / (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) > THRESHOLD_COLLECTIVE_OPERATIONS :
1153+ if (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) and row ['MPIIO_INDEP_READS' ] / (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) > THRESHOLD_COLLECTIVE_OPERATIONS and ( row [ 'MPIIO_INDEP_READS' ] + row [ 'MPIIO_INDEP_WRITES' ]) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE :
11041154 detail .append (
11051155 {
11061156 'message' : '{} ({}%) of independent reads to "{}"' .format (
@@ -1136,7 +1186,7 @@ def main():
11361186 total_mpiio_write_operations = df_mpiio ['counters' ]['MPIIO_INDEP_WRITES' ].sum () + df_mpiio ['counters' ]['MPIIO_COLL_WRITES' ].sum ()
11371187
11381188 if df_mpiio ['counters' ]['MPIIO_COLL_WRITES' ].sum () == 0 :
1139- if total_mpiio_write_operations :
1189+ if total_mpiio_write_operations and total_mpiio_write_operations > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE :
11401190 issue = 'Application uses MPI-IO but it does not use collective write operations, instead it issues {} ({:.2f}%) independent write calls' .format (
11411191 df_mpiio ['counters' ]['MPIIO_INDEP_WRITES' ].sum (),
11421192 df_mpiio ['counters' ]['MPIIO_INDEP_WRITES' ].sum () / (total_mpiio_write_operations ) * 100
@@ -1147,7 +1197,7 @@ def main():
11471197 files = pd .DataFrame (df_mpiio_collective_writes .groupby ('id' ).sum ()).reset_index ()
11481198
11491199 for index , row in df_mpiio_collective_writes .iterrows ():
1150- if (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) and row ['MPIIO_INDEP_WRITES' ] / (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) > THRESHOLD_COLLECTIVE_OPERATIONS :
1200+ if (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) and row ['MPIIO_INDEP_WRITES' ] / (row ['MPIIO_INDEP_READS' ] + row ['MPIIO_INDEP_WRITES' ]) > THRESHOLD_COLLECTIVE_OPERATIONS and ( row [ 'MPIIO_INDEP_READS' ] + row [ 'MPIIO_INDEP_WRITES' ]) > THRESHOLD_COLLECTIVE_OPERATIONS_ABSOLUTE :
11511201 detail .append (
11521202 {
11531203 'message' : '{} ({}%) independent writes to "{}"' .format (
@@ -1182,12 +1232,20 @@ def main():
11821232
11841234 # Look for usage of non-blocking operations
11841234
1235+ # Look for HDF5 file extension
1236+
1237+ has_hdf5_extension = False
1238+
1239+ for index , row in df_mpiio ['counters' ].iterrows ():
1240+ if file_map [int (row ['id' ])].endswith ('.h5' ) or file_map [int (row ['id' ])].endswith ('.hdf5' ):
1241+ has_hdf5_extension = True
1242+
11851243 if df_mpiio ['counters' ]['MPIIO_NB_READS' ].sum () == 0 :
11861244 issue = 'Application could benefit from non-blocking (asynchronous) reads'
11871245
11881246 recommendation = []
11891247
1190- if 'H5F' in modules :
1248+ if 'H5F' in modules or has_hdf5_extension :
11911249 recommendation .append (
11921250 {
11931251 'message' : 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)' ,
@@ -1212,7 +1270,7 @@ def main():
12121270
12131271 recommendation = []
12141272
1215- if 'H5F' in modules :
1273+ if 'H5F' in modules or has_hdf5_extension :
12161274 recommendation .append (
12171275 {
12181276 'message' : 'Since you use HDF5, consider using the ASYNC I/O VOL connector (https://github.com/hpc-io/vol-async)' ,
0 commit comments