Begin attempting to port the distance calculation to the transition parser

AngledLuffa · AngledLuffa · commit 7ba55d294da6 · 2026-02-26T22:06:59.000-08:00
Connect the distance to the outputs using a flag for the scaling factor
diff --git a/stanza/models/depparse/transition/model.py b/stanza/models/depparse/transition/model.py
@@ -2,8 +2,10 @@
 
 import torch
 from torch import nn
+import torch.nn.functional as F
 from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 
+from stanza.models.common.biaffine import DeepBiaffineScorer
 from stanza.models.common.utils import build_nonlinearity, unsort
 from stanza.models.common.vocab import VOCAB_PREFIX_SIZE
 from stanza.models.depparse.model import BaseParser, EmbeddingParser
@@ -211,6 +213,10 @@ def __init__(self, args, vocab, emb_matrix=None, foundation_cache=None, bert_mod
         self.transition_loss_function = nn.CrossEntropyLoss(reduction='sum')
         self.deprel_loss_function = nn.CrossEntropyLoss(reduction='sum')
 
+        # NOTE: self.args['distance_output_dim'] is not read yet; the scorer below passes a literal 1 (presumably the output dim)
+        self.distance = DeepBiaffineScorer(2 * self.args['hidden_dim'], 2 * self.args['hidden_dim'], self.args['deep_biaff_hidden_dim'], 1, pairwise=True, dropout=self.args['dropout'])
+        #self.distance_expansion = nn.Linear(self.args['distance_output_dim'], self.merge_hidden_dim)
+
     def forward(self, states):
         """
         Builds a list of logits for the different operations, including a separate one for each Left and Right merge 
@@ -300,7 +306,8 @@ def forward(self, states):
                 attachment_input_left = attachment_input.expand(state.word_position, attachment_input.shape[0])
                 left_arc_hx = torch.cat([attachment_input_left, attachment_embeddings_left], axis=1)
                 left_arc_hx = self.merge_output_left(left_arc_hx)
-                left_output = self.output_left_transition(self.drop(self.nonlinearity(left_arc_hx)))
+                distance_left = state.distance[0, 1:state.word_position+1, state.current_heads[-1]].unsqueeze(1).detach()
+                left_output = self.output_left_transition(self.drop(self.nonlinearity(left_arc_hx))) + distance_left * self.args['distance_factor']
                 left_deprel = self.output_left_deprel(self.drop(self.nonlinearity(left_arc_hx)))
 
                 # truncate the outputs to only be the current heads,
@@ -311,7 +318,8 @@ def forward(self, states):
                 attachment_input_right = attachment_input.unsqueeze(0).expand(current_heads.shape[0], attachment_input.shape[0])
                 right_arc_hx = torch.cat([attachment_input_right, attachment_embeddings_right], axis=1)
                 right_arc_hx = self.merge_output_right(right_arc_hx)
-                right_output = self.output_right_transition(self.drop(self.nonlinearity(right_arc_hx)))
+                distance_right = state.distance[0, state.current_heads[-1], :][current_heads].unsqueeze(1).detach()
+                right_output = self.output_right_transition(self.drop(self.nonlinearity(right_arc_hx))) + distance_right * self.args['distance_factor']
                 right_deprel = self.output_right_deprel(self.drop(self.nonlinearity(right_arc_hx)))
                 final_output[state_idx] = [final_output[state_idx][0], left_output.squeeze(1), right_output.squeeze(1)]
             left_deprels.append(left_deprel)
@@ -598,6 +606,10 @@ def loss(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, p
         states = self.build_initial_states(head, deprel, text, lstm_outputs, sentlens)
 
         total_loss = 0
+        for state, sentence_head in zip(states, head):
+            dist_kld = torch.gather(state.distance[0, 1:, :], 1, sentence_head[:state.num_words].unsqueeze(1))
+            # subtract, not add: state.distance holds -log(...) values (<= 0), so -= raises the loss; the += model was completely broken
+            total_loss -= dist_kld.sum()
 
         iteration = 0
         while len(states) > 0:
@@ -722,12 +734,24 @@ def build_initial_states(self, head, deprel, text, lstm_outputs, sentlens):
         else:
             states = [state_from_text(sentence) for sentence in text]
         updated_states = []
+
         # TODO: list comprehension?
         for state, lstm_output, sentlen in zip(states, lstm_outputs, sentlens):
             # the sentences are all prepended with root
             # which is fine, since we need an embedding for word 0
+            # for distance, the graph parser uses the extra space with the root
+            # TODO: stack the distance operation
+            head_offset = (torch.arange(sentlen, device=lstm_output.device).view(1, 1, -1) -
+                           torch.arange(sentlen, device=lstm_output.device).view(1, -1, 1))
+            distance_scores = self.distance(self.drop(lstm_output[:sentlen].unsqueeze(0)),
+                                            self.drop(lstm_output[:sentlen].unsqueeze(0))).squeeze(3).squeeze(0)
+            distance_pred = 1 + F.softplus(distance_scores)
+            distance_target = torch.abs(head_offset)
+            distance_kld = -torch.log((distance_target.float() - distance_pred)**2/2 + 1)
+
             state = state._replace(word_embeddings=lstm_output,
-                                   subtree_embeddings={})
+                                   subtree_embeddings={},
+                                   distance=distance_kld)
             updated_states.append(state)
         return updated_states
 
diff --git a/stanza/models/depparse/transition/state.py b/stanza/models/depparse/transition/state.py
@@ -21,7 +21,8 @@
 # transition_lstm_embeddings is a list of the above TransitionLSTMEmbedding namedtuple - one per transition
 State = namedtuple('State', ['transitions', 'parsed_graph', 'word_position', 'num_words', 'current_heads',
                              'gold_graph', 'gold_sequence', 'word_embeddings', 'subtree_embeddings',
-                             'transition_lstm_embeddings', 'subtree_lstm_embeddings'])
+                             'transition_lstm_embeddings', 'subtree_lstm_embeddings',
+                             'distance'])
 
 def is_nonproj(gold_graph, node, pred):
     for middle in range(node+1, pred):
@@ -33,7 +34,7 @@ def is_nonproj(gold_graph, node, pred):
 
 def build_gold_sequence(gold_graph):
     num_words = len(gold_graph.nodes()) - 1
-    state = State([], nx.DiGraph(), 0, num_words, [], None, None, None, None, [], [])
+    state = State([], nx.DiGraph(), 0, num_words, [], None, None, None, None, [], [], None)
 
     # determine which arcs are non-projective
     # key is the head, value is a set of the children which are non-proj
@@ -123,7 +124,7 @@ def state_from_graph(gold_graph):
 
     gold_sequence = build_gold_sequence(gold_graph)
     num_words = len(gold_graph.nodes()) - 1
-    return State(transitions, empty_graph, 0, num_words, [], gold_graph, gold_sequence, None, None, [], [])
+    return State(transitions, empty_graph, 0, num_words, [], gold_graph, gold_sequence, None, None, [], [], None)
 
 def from_gold(sentence):
     gold_graph = nx.DiGraph()
@@ -160,4 +161,4 @@ def state_from_text(text):
     transitions = []
     num_words = len(text)
     empty_graph = nx.DiGraph()
-    return State(transitions, empty_graph, 0, num_words, [], None, None, None, None, [], [])
+    return State(transitions, empty_graph, 0, num_words, [], None, None, None, None, [], [], None)
diff --git a/stanza/models/parser.py b/stanza/models/parser.py
@@ -89,6 +89,8 @@ def build_argparse():
     parser.add_argument('--char_hidden_dim', type=int, default=400)
     parser.add_argument('--deep_biaff_hidden_dim', type=int, default=400)
     parser.add_argument('--deep_biaff_output_dim', type=int, default=160)
+    parser.add_argument('--distance_output_dim', type=int, default=1)
+    parser.add_argument('--distance_factor', type=float, default=0.1, help="How much weight to put on the distance")
     # As an additional option, we implement arc embeddings
     #  described in https://arxiv.org/pdf/2501.09451
     #  Scaling Graph-Based Dependency Parsing with Arc Vectorization and Attention-Based Refinement