
Commit 767d463

A couple of experiments on dataset mixing that didn't have any positive effect

lang_lr_attenuation: decays the learning rate per epoch
lang_lr_weights: does scaling as well, so the learning rate for an individual language can be set lower than the default
Document how to use the flags in the --help
Add some comments on an experiment that seemed like it might have helped, but so far didn't
1 parent 101fdb5 commit 767d463

4 files changed: +48 -0 lines changed


stanza/models/coref/config.py

Lines changed: 2 additions & 0 deletions
@@ -67,3 +67,5 @@ class Config: # pylint: disable=too-many-instance-attributes, too-few-public-me
     max_train_len: int
     use_zeros: bool
 
+    lang_lr_attenuation: str
+    lang_lr_weights: str

stanza/models/coref/coref_config.toml

Lines changed: 16 additions & 0 deletions
@@ -122,6 +122,22 @@ max_train_len = 5000
 # if this is set to false, the model will set its zero_predictor to, well, 0
 use_zeros = true
 
+# two different methods for specifying how to weaken the LR for certain languages
+# however, in their current forms, on an HE experiment, neither worked
+# better than just mixing the two datasets together unweighted
+# Starting from the HE IAHLT dataset, and possibly mixing in the ger/rom ud coref,
+# averaging over 5 different seeds, we got the following results:
+# HE only: 0.497
+# Attenuated: 0.508
+# Scaled: 0.517
+# Mixed: 0.517
+# the attenuation scheme for that experiment was 1/epoch
+# These were the settings
+# --lang_lr_weights es=0.2,en=0.2,de=0.2,ca=0.2,fr=0.2,no=0.2
+# --lang_lr_attenuation es,en,de,ca,fr,no
+lang_lr_attenuation = ""
+lang_lr_weights = ""
+
 # =============================================================================
 # Extra keyword arguments to be passed to bert tokenizers of specified models
 [DEFAULT.tokenizer_kwargs]
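
As a worked example (not part of the commit), the settings quoted in the comments give a language both a 0.2 weight and the 1/epoch attenuation; following the loss-scaling logic added to model.py below, the effective scale per epoch works out to:

# Illustrative only: combined effect of a 0.2 weight from lang_lr_weights and
# the lang_lr_attenuation scheme, i.e. weight / max(epoch, 1).
weight = 0.2
for epoch in range(5):
    print(f"epoch {epoch}: loss scale {weight / max(epoch, 1.0):.3f}")
# epoch 0: loss scale 0.200
# epoch 1: loss scale 0.200
# epoch 2: loss scale 0.100
# epoch 3: loss scale 0.067
# epoch 4: loss scale 0.050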

stanza/models/coref/model.py

Lines changed: 20 additions & 0 deletions
@@ -486,6 +486,19 @@ def train(self, log=False):
             # new model, set it to always predict not-zero
             self.disable_zeros_predictor()
 
+        attenuated_languages = set()
+        if self.config.lang_lr_attenuation:
+            attenuated_languages = self.config.lang_lr_attenuation.split(",")
+            logger.info("Attenuating LR for the following languages: %s", attenuated_languages)
+
+        lr_scaled_languages = dict()
+        if self.config.lang_lr_weights:
+            scaled_languages = self.config.lang_lr_weights.split(",")
+            for piece in scaled_languages:
+                pieces = piece.split("=")
+                lr_scaled_languages[pieces[0]] = float(pieces[1])
+            logger.info("Scaling LR for the following languages: %s", lr_scaled_languages)
+
         best_f1 = None
         for epoch in range(self.epochs_trained, self.config.train_epochs):
             self.training = True
@@ -526,6 +539,13 @@ def train(self, log=False):
                 else:
                     s_loss = torch.zeros_like(c_loss)
 
+                lr_scale = lr_scaled_languages.get(doc.get("lang"), 1.0)
+                if doc.get("lang") in attenuated_languages:
+                    lr_scale = lr_scale / max(epoch, 1.0)
+                c_loss = c_loss * lr_scale
+                s_loss = s_loss * lr_scale
+                z_loss = z_loss * lr_scale
+
                 (c_loss + s_loss + z_loss).backward()
 
                 running_c_loss += c_loss.item()
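
The same logic, restated as a standalone sketch (the helper name is hypothetical, not in the commit), makes the effect of max(epoch, 1.0) explicit: the scale is unchanged in epochs 0 and 1 and only starts decaying from epoch 2.

# Hypothetical helper mirroring the loss scaling added to train() above.
def loss_scale(lang, epoch, lang_lr_weights="", lang_lr_attenuation=""):
    weights = dict(piece.split("=") for piece in lang_lr_weights.split(",") if piece)
    attenuated = set(lang_lr_attenuation.split(",")) if lang_lr_attenuation else set()
    scale = float(weights.get(lang, 1.0))
    if lang in attenuated:
        scale = scale / max(epoch, 1.0)
    return scale

assert loss_scale("he", epoch=3) == 1.0               # unlisted language is untouched
assert loss_scale("es", 0, "es=0.2", "es") == 0.2     # no decay at epoch 0
assert loss_scale("es", 1, "es=0.2", "es") == 0.2     # ...or at epoch 1
assert loss_scale("es", 4, "es=0.2", "es") == 0.05    # weight / epoch afterwards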

stanza/models/wl_coref.py

Lines changed: 10 additions & 0 deletions
@@ -134,6 +134,11 @@ def deterministic() -> None:
     argparser.add_argument("--seed", type=int, default=2020,
                            help="Random seed to set")
 
+    argparser.add_argument("--lang_lr_attenuation", type=str, default=None,
+                           help="A comma-separated list of languages where the LR will be scaled by 1/epoch, such as --lang_lr_attenuation=es,en,de,...")
+    argparser.add_argument("--lang_lr_weights", type=str, default=None,
+                           help="A comma-separated list of languages and their weights of LR scaling for different languages, such as es=0.5,en=1.0,...")
+
     argparser.add_argument("--max_train_len", type=int, default=5000,
                            help="Skip any documents longer than this maximum length")
     argparser.add_argument("--no_max_train_len", action="store_const", const=float("inf"), dest="max_train_len",
@@ -196,6 +201,11 @@ def deterministic() -> None:
     if args.max_train_len:
         config.max_train_len = args.max_train_len
 
+    if args.lang_lr_attenuation:
+        config.lang_lr_attenuation = args.lang_lr_attenuation
+    if args.lang_lr_weights:
+        config.lang_lr_weights = args.lang_lr_weights
+
     # if wandb, generate wandb configuration
     if args.mode == "train":
         if args.wandb:
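
For reference, a minimal round trip through the two new options (standalone sketch, not part of the commit): the values stay as raw comma-separated strings in the config and are only split apart later in model.py.

import argparse

# Minimal parser containing just the two options added above.
argparser = argparse.ArgumentParser()
argparser.add_argument("--lang_lr_attenuation", type=str, default=None)
argparser.add_argument("--lang_lr_weights", type=str, default=None)

args = argparser.parse_args(["--lang_lr_weights", "es=0.2,en=0.2",
                             "--lang_lr_attenuation", "es,en"])
print(args.lang_lr_weights)      # es=0.2,en=0.2  (raw string, parsed in model.py)
print(args.lang_lr_attenuation)  # es,en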
