Merge pull request #8 from awwong1/issue-7

awwong1 · web-flow · commit b45f937b7909 · 2020-04-05T12:00:23.000-06:00
Sum over all occurrences per layer, display occurrences per layer
diff --git a/README.md b/README.md
@@ -24,53 +24,52 @@ with torchprof.Profile(model, use_cuda=True) as prof:
 print(prof.display(show_events=False)) # equivalent to `print(prof)` and `print(prof.display())`
 ```
 ```text
-Module         | Self CPU total | CPU total | CUDA total
----------------|----------------|-----------|-----------
-AlexNet        |                |           |
-├── features   |                |           |
-│├── 0         |        1.956ms |   7.714ms |    7.787ms
-│├── 1         |       68.880us |  68.880us |   69.632us
-│├── 2         |       85.639us | 155.948us |  155.648us
-│├── 3         |      253.419us | 970.386us |    1.747ms
-│├── 4         |       18.919us |  18.919us |   19.584us
-│├── 5         |       30.910us |  54.900us |   55.296us
-│├── 6         |      132.839us | 492.367us |  652.192us
-│├── 7         |       17.990us |  17.990us |   18.432us
-│├── 8         |       87.219us | 310.776us |  552.544us
-│├── 9         |       17.620us |  17.620us |   17.536us
-│├── 10        |       85.690us | 303.120us |  437.248us
-│├── 11        |       17.910us |  17.910us |   18.400us
-│└── 12        |       29.239us |  51.488us |   52.288us
-├── avgpool    |       49.230us |  85.740us |   88.960us
-└── classifier |                |           |
- ├── 0         |      626.236us |   1.239ms |    1.362ms
- ├── 1         |      235.669us | 235.669us |  635.008us
- ├── 2         |       17.990us |  17.990us |   18.432us
- ├── 3         |       31.890us |  56.770us |   57.344us
- ├── 4         |       39.280us |  39.280us |  212.128us
- ├── 5         |       16.800us |  16.800us |   17.600us
- └── 6         |       38.459us |  38.459us |   79.872us
+Module         | Self CPU total | CPU total | CUDA total | Occurrences
+---------------|----------------|-----------|------------|------------
+AlexNet        |                |           |            |
+├── features   |                |           |            |
+│├── 0         |        1.671ms |   6.589ms |    6.701ms |           1
+│├── 1         |       62.430us |  62.430us |   63.264us |           1
+│├── 2         |       62.909us | 109.948us |  112.640us |           1
+│├── 3         |      225.389us | 858.376us |    1.814ms |           1
+│├── 4         |       18.999us |  18.999us |   19.456us |           1
+│├── 5         |       29.560us |  52.720us |   54.272us |           1
+│├── 6         |      136.959us | 511.216us |  707.360us |           1
+│├── 7         |       18.480us |  18.480us |   18.624us |           1
+│├── 8         |       84.380us | 300.700us |  590.688us |           1
+│├── 9         |       18.249us |  18.249us |   17.632us |           1
+│├── 10        |       81.289us | 289.946us |  470.016us |           1
+│├── 11        |       17.850us |  17.850us |   18.432us |           1
+│└── 12        |       29.350us |  52.260us |   52.288us |           1
+├── avgpool    |       41.840us |  70.840us |   76.832us |           1
+└── classifier |                |           |            |
+ ├── 0         |       66.400us | 122.110us |  125.920us |           1
+ ├── 1         |      293.658us | 293.658us |  664.704us |           1
+ ├── 2         |       17.600us |  17.600us |   18.432us |           1
+ ├── 3         |       27.920us |  49.030us |   51.168us |           1
+ ├── 4         |       40.590us |  40.590us |  208.672us |           1
+ ├── 5         |       17.570us |  17.570us |   18.432us |           1
+ └── 6         |       40.489us |  40.489us |   81.920us |           1
 ```
 
 To see the low level operations that occur within each layer, print the contents of  `prof.display(show_events=True)`.
 
 ```text
-Module                        | Self CPU total | CPU total | CUDA total
-------------------------------|----------------|-----------|-----------
-AlexNet                       |                |           |
-├── features                  |                |           |
-│├── 0                        |                |           |
-││├── conv2d                  |       15.740us |   1.956ms |    1.972ms
-││├── convolution             |       12.000us |   1.940ms |    1.957ms
-││├── _convolution            |       36.590us |   1.928ms |    1.946ms
-││├── contiguous              |        6.600us |   6.600us |    6.464us
-││└── cudnn_convolution       |        1.885ms |   1.885ms |    1.906ms
-│├── 1                        |                |           |
-││└── relu_                   |       68.880us |  68.880us |   69.632us
-│├── 2                        |                |           |
-││├── max_pool2d              |       15.330us |  85.639us |   84.992us
-││└── max_pool2d_with_indices |       70.309us |  70.309us |   70.656us
-│├── 3                        |                |           |
+Module                        | Self CPU total | CPU total | CUDA total | Occurrences
+------------------------------|----------------|-----------|------------|------------
+AlexNet                       |                |           |            |
+├── features                  |                |           |            |
+│├── 0                        |                |           |            |
+││├── conv2d                  |       13.370us |   1.671ms |    1.698ms |           1
+││├── convolution             |       12.730us |   1.658ms |    1.685ms |           1
+││├── _convolution            |       30.660us |   1.645ms |    1.673ms |           1
+││├── contiguous              |        6.970us |   6.970us |    7.136us |           1
+││└── cudnn_convolution       |        1.608ms |   1.608ms |    1.638ms |           1
+│├── 1                        |                |           |            |
+││└── relu_                   |       62.430us |  62.430us |   63.264us |           1
+│├── 2                        |                |           |            |
+││├── max_pool2d              |       15.870us |  62.909us |   63.488us |           1
+││└── max_pool2d_with_indices |       47.039us |  47.039us |   49.152us |           1
 ...
 ```
 
@@ -85,17 +84,17 @@ print(trace[2])
 print(event_lists_dict[trace[2].path][0])
 ```
 ```text
----------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
-Name                   Self CPU total %   Self CPU total      CPU total %        CPU total     CPU time avg     CUDA total %       CUDA total    CUDA time avg  Number of Calls
----------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
-conv2d                           0.80%         15.740us          100.00%          1.956ms          1.956ms           25.32%          1.972ms          1.972ms                1
-convolution                      0.61%         12.000us           99.20%          1.940ms          1.940ms           25.14%          1.957ms          1.957ms                1
-_convolution                     1.87%         36.590us           98.58%          1.928ms          1.928ms           24.99%          1.946ms          1.946ms                1
-contiguous                       0.34%          6.600us            0.34%          6.600us          6.600us            0.08%          6.464us          6.464us                1
-cudnn_convolution               96.37%          1.885ms           96.37%          1.885ms          1.885ms           24.47%          1.906ms          1.906ms                1
----------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------
-Self CPU time total: 1.956ms
-CUDA time total: 7.787ms
+---------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  -----------------------------------
+Name                   Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     CUDA total %     CUDA total       CUDA time avg    Number of Calls  Input Shapes
+---------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  -----------------------------------
+conv2d                 0.80%            13.370us         100.00%          1.671ms          1.671ms          25.34%           1.698ms          1.698ms          1                []
+convolution            0.76%            12.730us         99.20%           1.658ms          1.658ms          25.15%           1.685ms          1.685ms          1                []
+_convolution           1.83%            30.660us         98.44%           1.645ms          1.645ms          24.97%           1.673ms          1.673ms          1                []
+contiguous             0.42%            6.970us          0.42%            6.970us          6.970us          0.11%            7.136us          7.136us          1                []
+cudnn_convolution      96.19%           1.608ms          96.19%           1.608ms          1.608ms          24.44%           1.638ms          1.638ms          1                []
+---------------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  -----------------------------------
+Self CPU time total: 1.671ms
+CUDA time total: 6.701ms
 
 ```
 
@@ -115,32 +114,32 @@ print(prof)
 ```
 
 ```text
-Module         | Self CPU total | CPU total | CUDA total
----------------|----------------|-----------|-----------
-AlexNet        |                |           |           
-├── features   |                |           |           
-│├── 0         |                |           |           
-│├── 1         |                |           |           
-│├── 2         |                |           |           
-│├── 3         |        2.846ms |  11.368ms |    0.000us
-│├── 4         |                |           |           
-│├── 5         |                |           |           
-│├── 6         |                |           |           
-│├── 7         |                |           |           
-│├── 8         |                |           |           
-│├── 9         |                |           |           
-│├── 10        |                |           |           
-│├── 11        |                |           |           
-│└── 12        |                |           |           
-├── avgpool    |                |           |           
-└── classifier |       12.016ms |  12.206ms |    0.000us
- ├── 0         |                |           |           
- ├── 1         |                |           |           
- ├── 2         |                |           |           
- ├── 3         |                |           |           
- ├── 4         |                |           |           
- ├── 5         |                |           |           
- └── 6         |                |           |           
+Module         | Self CPU total | CPU total | CUDA total | Occurrences
+---------------|----------------|-----------|------------|------------
+AlexNet        |                |           |            |
+├── features   |                |           |            |
+│├── 0         |                |           |            |
+│├── 1         |                |           |            |
+│├── 2         |                |           |            |
+│├── 3         |        3.189ms |  12.717ms |    0.000us |           1
+│├── 4         |                |           |            |
+│├── 5         |                |           |            |
+│├── 6         |                |           |            |
+│├── 7         |                |           |            |
+│├── 8         |                |           |            |
+│├── 9         |                |           |            |
+│├── 10        |                |           |            |
+│├── 11        |                |           |            |
+│└── 12        |                |           |            |
+├── avgpool    |                |           |            |
+└── classifier |       13.403ms |  14.011ms |    0.000us |           1
+ ├── 0         |                |           |            |
+ ├── 1         |                |           |            |
+ ├── 2         |                |           |            |
+ ├── 3         |                |           |            |
+ ├── 4         |                |           |            |
+ ├── 5         |                |           |            |
+ └── 6         |                |           |            |
 
 ```
 
diff --git a/torchprof/profile.py b/torchprof/profile.py
@@ -3,7 +3,7 @@
 from collections import namedtuple, defaultdict, OrderedDict
 
 Trace = namedtuple("Trace", ["path", "leaf", "module"])
-Measure = namedtuple("Measure", ["self_cpu_total", "cpu_total", "cuda_total"])
+Measure = namedtuple("Measure", ["self_cpu_total", "cpu_total", "cuda_total", "occurrences"])
 
 
 def walk_modules(module, name="", path=()):
@@ -133,16 +133,18 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None):
                     for event in events:
                         current_tree[name][event.name] = {
                             None: Measure(
-                                event.self_cpu_time_total,
-                                event.cpu_time_total,
-                                event.cuda_time_total,
+                                sum([e.self_cpu_time_total for e in events if e.name == event.name]),
+                                sum([e.cpu_time_total for e in events if e.name == event.name]),
+                                sum([e.cuda_time_total for e in events if e.name == event.name]),
+                                len([e for e in events if e.name == event.name])
                             )
                         }
                 else:
                     current_tree[name][None] = Measure(
                         sum([e.self_cpu_time_total for e in events]),
                         sum([e.cpu_time_total for e in events]),
                         sum([e.cuda_time_total for e in events]),
+                        len(trace_events[path])
                     )
             current_tree = current_tree[name]
     tree_lines = flatten_tree(tree)
@@ -155,10 +157,12 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None):
         self_cpu_time = ""
         cpu_time = ""
         cuda_time = ""
+        occurrences = ""
         if measures:
             self_cpu_time = tprofiler.format_time(measures.self_cpu_total)
             cpu_time = tprofiler.format_time(measures.cpu_total)
             cuda_time = tprofiler.format_time(measures.cuda_total)
+            occurrences = str(measures.occurrences)
         pre = ""
         next_depths = [pl[0] for pl in tree_lines[idx + 1 :]]
         current = True
@@ -175,22 +179,24 @@ def traces_to_display(traces, trace_events, show_events=False, paths=None):
                     pre = dt[3] + pre
             depth -= 1
             current = False
-        format_lines.append([pre + name, self_cpu_time, cpu_time, cuda_time])
+        format_lines.append([pre + name, self_cpu_time, cpu_time, cuda_time, occurrences])
 
     # construct the table
-    heading = ("Module", "Self CPU total", "CPU total", "CUDA total")
+    heading = ("Module", "Self CPU total", "CPU total", "CUDA total", "Occurrences")
     max_lens = [max(map(len, col)) for col in zip(*([heading] + format_lines))]
     # create the heading
     disp = "{:<{}s}".format(heading[0], max_lens[0]) + " | "
     disp += "{:>{}s}".format(heading[1], max_lens[1]) + " | "
     disp += "{:>{}s}".format(heading[2], max_lens[2]) + " | "
-    disp += "{:>{}s}".format(heading[3], max_lens[3]) + "\n"
+    disp += "{:>{}s}".format(heading[3], max_lens[3]) + " | "
+    disp += "{:>{}s}".format(heading[4], max_lens[4]) + "\n"
     disp += "-|-".join(["-" * mlen for mlen in max_lens]) + "\n"
     for line in format_lines:
-        label, self_cpu_time, cpu_time, cuda_time = line
+        label, self_cpu_time, cpu_time, cuda_time, occurrences = line
         disp += "{:<{}s}".format(label, max_lens[0]) + " | "
         disp += "{:>{}s}".format(self_cpu_time, max_lens[1]) + " | "
         disp += "{:>{}s}".format(cpu_time, max_lens[2]) + " | "
-        disp += "{:>{}s}".format(cuda_time, max_lens[3]) + "\n"
+        disp += "{:>{}s}".format(cuda_time, max_lens[3]) + " | "
+        disp += "{:>{}s}".format(occurrences, max_lens[4]) + "\n"
 
     return disp