@@ -8,6 +8,7 @@ def __init__(self):
         self.penalties = []
         self.with_weights = False
         self._ret_per_sample = False
+        self._nonzero_averaging = False
         self._fn_forward = {}

     def _assert_same_dim(self, symb_input, symb_target):
@@ -45,7 +46,16 @@ def __call__(self, symb_input, symb_target, with_penalties=True):
         # Criteria may return per-sample cost which we will average
         # (optionally weighted) across samples, if necessary.
         if cost.ndim != 0:
-            cost = df.T.mean(cost)
+            # The default is to average over the whole batch, regardless of
+            # the loss values. But we also allow averaging over only the
+            # non-zero losses. Especially for margin losses this can make
+            # sense, as it effectively weights the rare non-zero losses higher.
+            if self._nonzero_averaging:
+                nnz = df.th.gradient.disconnected_grad(cost.nonzero_values().shape[0])
+                cost = df.T.sum(cost) / (1e-8 + nnz)
+            else:
+                cost = df.T.mean(cost)
+
         if symb_weights is not None:
             # Need a very small eps to avoid 0/0 when all weights are 0!
             cost = cost / (1e-8 + df.T.mean(symb_weights))
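
(Aside: a minimal NumPy sketch of the two averaging modes in the hunk above, with made-up numbers; the 1e-8 epsilon mirrors the one in the diff and guards against a batch where every loss is zero.)

import numpy as np

# Per-sample losses as a margin loss might produce them: most samples
# incur zero cost, only a few violate the margin.
cost = np.array([0.0, 0.0, 3.0, 0.0, 1.0])

plain_mean = cost.mean()                  # 0.8: averages over all 5 samples
nnz = np.count_nonzero(cost)              # 2 non-zero losses
nonzero_mean = cost.sum() / (1e-8 + nnz)  # ~2.0: averages over those 2 only

# Averaging over only the non-zero entries yields a larger value, which
# effectively up-weights the rare samples that actually incur a loss.
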
@@ -68,6 +78,10 @@ def enable_per_sample_cost(self):
         self._ret_per_sample = True
         return self

+    def enable_nonzero_averaging(self):
+        self._nonzero_averaging = True
+        return self
+
     def forward(self, num_input, num_target, with_penalties=True, per_sample=False):
         # NOTE: using the GPU for such trivial computations as most costs
         # is actually somewhat slower (e.g. for RMSE: GPU 1.2ms vs. CPU 0.2ms).
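
(Usage sketch, hypothetical: SomeMarginCriterion is an illustrative name, not a class defined in this diff. Like enable_per_sample_cost above, the new method returns self, so it chains at construction time.)

crit = SomeMarginCriterion().enable_nonzero_averaging()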