|
42 | 42 |
|
43 | 43 | insights_operation = [] |
44 | 44 | insights_metadata = [] |
| 45 | +insights_dxt = [] |
45 | 46 |
|
46 | 47 | insights_total = dict() |
47 | 48 |
|
|
95 | 96 | INSIGHTS_MPI_IO_AGGREGATORS_INTRA = 'M08' |
96 | 97 | INSIGHTS_MPI_IO_AGGREGATORS_INTER = 'M09' |
97 | 98 | INSIGHTS_MPI_IO_AGGREGATORS_OK = 'M10' |
| 99 | +INSIGHTS_DXT_RANK_ZERO_IMBALANCE = 'D01' |
| 100 | +INSIGHTS_DXT_RANK_IMBALANCE = 'D02' |
98 | 101 |
|
99 | 102 | # TODO: need to verify that the threshold is between 0 and 1 |
100 | 103 | # TODO: read thresholds from file |
|
179 | 182 | help='Export a CSV with the code of all issues that were triggered' |
180 | 183 | ) |
181 | 184 |
|
| 185 | +parser.add_argument( |
| 186 | + '--rank_zero_imbalance', |
| 187 | + default=False, |
| 188 | + action='store_true', |
| 189 | + dest='rank_zero_imbalance', |
| 190 | + help=argparse.SUPPRESS) |
| 191 | + |
| 192 | +parser.add_argument( |
| 193 | + '--unbalanced_workload', |
| 194 | + default=False, |
| 195 | + action='store_true', |
| 196 | + dest='unbalanced_workload', |
| 197 | + help=argparse.SUPPRESS) |
| 198 | + |
182 | 199 | args = parser.parse_args() |
183 | 200 |
|
184 | 201 | if args.export_size: |
@@ -1437,7 +1454,43 @@ def main(): |
1437 | 1454 | pass |
1438 | 1455 | except FileNotFoundError: |
1439 | 1456 | pass |
| 1457 | + |
| 1458 | + ######################################################################################################################################################################### |
| 1459 | + |
| 1460 | + if args.rank_zero_imbalance: |
| 1461 | + issue = 'Rank 0 is issuing a lot of I/O requests' |
| 1462 | + |
| 1463 | + recommendation = [ |
| 1464 | + { |
| 1465 | + 'message': 'Consider using MPI-IO collective' |
| 1466 | + } |
| 1467 | + ] |
| 1468 | + |
| 1469 | + insights_dxt.append( |
| 1470 | + message(INSIGHTS_DXT_RANK_ZERO_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) |
| 1471 | + ) |
| 1472 | + |
| 1473 | + ######################################################################################################################################################################### |
| 1474 | + |
| 1475 | + if args.unbalanced_workload: |
| 1476 | + issue = 'Detected unbalanced workload between the ranks' |
| 1477 | + |
| 1478 | + recommendation = [ |
| 1479 | + { |
| 1480 | + 'message': 'Consider better balancing the data transfer between the application ranks' |
| 1481 | + }, |
| 1482 | + { |
| 1483 | + 'message': 'Consider tuning the stripe size and count to better distribute the data' |
| 1484 | + }, |
| 1485 | + { |
| 1486 | + 'message': 'If the application uses netCDF and HDF5, double check the need to set NO_FILL values' |
| 1487 | + } |
| 1488 | + ] |
1440 | 1489 |
|
| 1490 | + insights_dxt.append( |
| 1491 | + message(INSIGHTS_DXT_RANK_IMBALANCE, TARGET_DEVELOPER, HIGH, issue, recommendation) |
| 1492 | + ) |
| 1493 | + |
1441 | 1494 | ######################################################################################################################################################################### |
1442 | 1495 |
|
1443 | 1496 | insights_end_time = time.time() |
@@ -1527,6 +1580,20 @@ def main(): |
1527 | 1580 | ) |
1528 | 1581 | ) |
1529 | 1582 |
|
| 1583 | + if insights_dxt: |
| 1584 | + console.print( |
| 1585 | + Panel( |
| 1586 | + Padding( |
| 1587 | + Group( |
| 1588 | + *insights_dxt |
| 1589 | + ), |
| 1590 | + (1, 1) |
| 1591 | + ), |
| 1592 | + title='DXT', |
| 1593 | + title_align='left' |
| 1594 | + ) |
| 1595 | + ) |
| 1596 | + |
1530 | 1597 | console.print( |
1531 | 1598 | Panel( |
1532 | 1599 | ' {} | [white]LBNL[/white] | [white]Drishti report generated at {} in[/white] {:.3f} seconds'.format( |
@@ -1615,7 +1682,9 @@ def main(): |
1615 | 1682 | INSIGHTS_MPI_IO_BLOCKING_WRITE_USAGE, |
1616 | 1683 | INSIGHTS_MPI_IO_AGGREGATORS_INTRA, |
1617 | 1684 | INSIGHTS_MPI_IO_AGGREGATORS_INTER, |
1618 | | - INSIGHTS_MPI_IO_AGGREGATORS_OK |
| 1685 | + INSIGHTS_MPI_IO_AGGREGATORS_OK, |
| 1686 | + INSIGHTS_DXT_RANK_ZERO_IMBALANCE, |
| 1687 | + INSIGHTS_DXT_RANK_IMBALANCE |
1619 | 1688 | ] |
1620 | 1689 |
|
1621 | 1690 | detected_issues = dict.fromkeys(issues, False) |
|
0 commit comments