FasterDecoding
diff --git a/medusa/model/utils.py b/medusa/model/utils.py
Lines changed: 155 additions & 33 deletions
@@ -1,4 +1,5 @@
 import torch
+import torch.nn.functional as F
 
 TOPK=10 # topk for sparse tree
 
@@ -192,8 +193,37 @@ def reset_past_key_values(passed_key_values):
             passed_key_values[i][j].current_length.fill_(0)
     return passed_key_values
 
def get_nucleus_one_token(logit, temperature, top_p):
    """Sample one token id per row using nucleus (top-p) sampling.

    Args:
        logit: 2-D tensor of unnormalised scores, one row per sample and one
            column per vocabulary entry (``[n, C]``). The caller's tensor is
            left untouched; all masking happens on a scaled copy.
        temperature: Divisor applied to the scores before the softmax. Must be
            non-zero; the greedy (``temperature == 0``) case is expected to be
            handled by the caller.
        top_p: Cumulative-probability cutoff that defines the nucleus.

    Returns:
        Integer tensor of shape ``[n, 1]`` with the sampled column index of
        each row.
    """
    # Scale the *whole* vocabulary. An earlier revision sliced ``[:, :-1]``
    # here; on a [n, C] input that slice runs along the vocabulary axis and
    # made the last token id impossible to sample.
    logit = logit / temperature
    probs = torch.softmax(logit, dim=-1)
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cum_probs = torch.cumsum(sorted_probs, dim=-1)

    # Flag everything whose cumulative mass exceeds top_p, then shift the
    # flags right by one so the token that crosses the cutoff is kept and the
    # most likely token always survives.
    sorted_indices_to_remove = cum_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Map the flags from sorted order back to vocabulary order.
    indices_to_remove = sorted_indices_to_remove.scatter(
        dim=1, index=sorted_indices, src=sorted_indices_to_remove
    )
    logit[indices_to_remove] = float('-inf')

    # Renormalise over the surviving tokens and draw one sample per row.
    return torch.multinomial(F.softmax(logit, dim=-1), 1)
+
def get_typical_one_token(logit, temperature, posterior_threshold, posterior_alpha):
    """Sample one token id per row using entropy-adaptive ("typical") filtering.

    A token is kept when its probability is at least
    ``min(posterior_threshold, exp(-entropy) * posterior_alpha)``, where the
    entropy is that of the row's temperature-scaled distribution. One token is
    then drawn from the renormalised survivors.

    Args:
        logit: 2-D tensor of unnormalised scores, one row per sample and one
            column per vocabulary entry (``[n, C]``). The caller's tensor is
            left untouched; all masking happens on a scaled copy.
        temperature: Divisor applied to the scores before the softmax. Must be
            non-zero.
        posterior_threshold: Hard upper bound on the acceptance threshold.
        posterior_alpha: Scale applied to ``exp(-entropy)``.

    Returns:
        Integer tensor of shape ``[n, 1]`` with the sampled column index of
        each row.
    """
    # Scale the *whole* vocabulary. An earlier revision sliced ``[:, :-1]``
    # here; on a [n, C] input that slice runs along the vocabulary axis and
    # made the last token id impossible to sample.
    logit = logit / temperature
    probs = torch.softmax(logit, dim=-1)
    # The 1e-5 keeps log() finite for zero-probability entries.
    entropy = -torch.sum(probs * torch.log(probs + 1e-5), dim=-1)
    threshold = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    indices_to_remove = probs < threshold.unsqueeze(-1)
    # Never discard the most likely token: with a large alpha/threshold every
    # entry could fall below the cutoff, leaving an all -inf row that makes
    # torch.multinomial raise.
    indices_to_remove &= probs < probs.max(dim=-1, keepdim=True).values
    logit[indices_to_remove] = float('-inf')
    return torch.multinomial(F.softmax(logit, dim=-1), 1)
 
-def generate_candidates(medusa_logits, logits, tree_indices, retrieve_indices):
+def generate_candidates(medusa_logits, logits, tree_indices, retrieve_indices, temperature = 0, posterior_threshold=0.3, posterior_alpha = 0.09, top_p=0.8, sampling = 'typical', fast = False):
     """
     Generate candidates based on provided logits and indices.
     
@@ -208,8 +238,15 @@ def generate_candidates(medusa_logits, logits, tree_indices, retrieve_indices):
     """
 
     # Greedy decoding: Select the most probable candidate from the original logits.
-    candidates_logit = torch.argmax(logits[:, -1]).unsqueeze(0)
-
+    if temperature == 0 or fast:
+        candidates_logit = torch.argmax(logits[:, -1]).unsqueeze(0)
+    else:
+        if sampling == 'typical':
+            candidates_logit = get_typical_one_token(logits[:, -1], temperature, posterior_threshold, posterior_alpha).squeeze(0)
+        elif sampling == 'nucleus':
+            candidates_logit = get_nucleus_one_token(logits[:, -1], temperature, top_p).squeeze(0)
+        else:
+            raise NotImplementedError
     # Extract the TOPK candidates from the medusa logits.
     candidates_medusa_logits = torch.topk(medusa_logits[:, 0, -1], TOPK, dim = -1).indices
 
@@ -271,9 +308,66 @@ def tree_decoding(
     medusa_logits = tree_medusa_logits[:, 0, retrieve_indices]
     return medusa_logits, logits, outputs
 
def get_nucleus_posterior_mask(logits, candidates, temperature, top_p):
    """Build an acceptance mask for candidates via nucleus (top-p) sampling.

    For every candidate and every position except the last, one token is
    sampled from the top-p nucleus of the temperature-scaled distribution. A
    candidate token is accepted when it equals the sample drawn for the
    position that precedes it.

    Args:
        logits: Scores indexed as ``[candidate, position, vocab]``; the last
            position is dropped because no candidate token follows it.
        candidates: Token ids indexed as ``[candidate, position]``. Column 0
            is skipped, so it is expected to have one more column than the
            number of positions that get sampled.
        temperature: Non-zero divisor applied to the scores before softmax.
        top_p: Cumulative-probability cutoff that defines the nucleus.

    Returns:
        ``int32`` tensor shaped ``[n_samples, n_tokens]`` holding 1 where the
        candidate token matches the sampled token and 0 elsewhere.
    """
    # adapted from https://github.com/huggingface/transformers/blob/18a879f47576822aa1a5c49aecb27d89bfa5fa69/examples/run_generation.py#L79

    # Apply temperature (this also yields a fresh tensor, so the caller's
    # logits are not modified by the in-place masking below).
    logits = logits[:, :-1] / temperature

    n_samples, n_tokens = logits.shape[0], logits.shape[1]
    if n_samples * n_tokens == 0:
        # Nothing to verify. Return an empty mask instead of letting
        # ``view(0, -1)`` fail on an ambiguous inferred dimension.
        return torch.zeros((n_samples, n_tokens), dtype=torch.int, device=logits.device)
    logits = logits.view(n_samples * n_tokens, -1)

    # Convert to probabilities and sort them in descending order.
    probs = F.softmax(logits, dim=-1)
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)

    # Cumulative mass along the sorted order.
    cum_probs = torch.cumsum(sorted_probs, dim=-1)

    # Flag everything beyond the cutoff, shifted right by one so the token
    # that crosses top_p is kept and the top-1 token always survives.
    sorted_indices_to_remove = cum_probs > top_p
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0

    # Map the flags from sorted order back to vocabulary order.
    indices_to_remove = sorted_indices_to_remove.scatter(
        dim=1, index=sorted_indices, src=sorted_indices_to_remove
    )

    # Remove low-probability tokens and sample from what is left.
    logits[indices_to_remove] = float('-inf')
    sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
    sampled_tokens = sampled_tokens.view(n_samples, n_tokens)

    # A candidate token is accepted when it equals the sampled token.
    posterior_mask = (candidates[:, 1:] == sampled_tokens).int()

    return posterior_mask
+
def get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha, fast=False):
    """Build an acceptance mask for candidates via entropy-adaptive sampling.

    For every candidate and every position except the last, tokens whose
    probability is below ``min(posterior_threshold, exp(-entropy) *
    posterior_alpha)`` are discarded, one token is sampled from the rest, and
    the candidate token is accepted when it equals that sample.

    Args:
        logits: Scores indexed as ``[candidate, position, vocab]``; the last
            position is dropped because no candidate token follows it.
        candidates: Token ids indexed as ``[candidate, position]``. Column 0
            is skipped, so it is expected to have one more column than the
            number of positions that get sampled.
        temperature: Non-zero divisor applied to the scores before softmax.
        posterior_threshold: Hard upper bound on the acceptance threshold.
        posterior_alpha: Scale applied to ``exp(-entropy)``.
        fast: Unused. Accepted only because ``evaluate_posterior`` passes it
            as a sixth positional argument; without it that call raises
            ``TypeError``.

    Returns:
        ``int32`` tensor shaped ``[n_samples, n_tokens]`` holding 1 where the
        candidate token matches the sampled token and 0 elsewhere.
    """
    # Dividing produces a fresh tensor, so the in-place masking below does not
    # touch the caller's logits.
    logits = logits[:, :-1] / temperature
    n_samples, n_tokens = logits.shape[0], logits.shape[1]
    if n_samples * n_tokens == 0:
        # Nothing to verify. Return an empty mask instead of letting
        # ``view(0, -1)`` fail on an ambiguous inferred dimension.
        return torch.zeros((n_samples, n_tokens), dtype=torch.int, device=logits.device)
    logits = logits.view(n_samples * n_tokens, -1)

    probs = F.softmax(logits, dim=-1)
    # The 1e-5 keeps log() finite for zero-probability entries.
    entropy = -torch.sum(probs * torch.log(probs + 1e-5), dim=-1)
    threshold = torch.minimum(
        torch.ones_like(entropy) * posterior_threshold,
        torch.exp(-entropy) * posterior_alpha,
    )
    indices_to_remove = probs < threshold.unsqueeze(-1)
    # Never discard the most likely token: with a large alpha/threshold every
    # entry could fall below the cutoff, leaving an all -inf row that makes
    # torch.multinomial raise.
    indices_to_remove &= probs < probs.max(dim=-1, keepdim=True).values
    logits[indices_to_remove] = float('-inf')

    sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
    sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
    posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
    return posterior_mask
+    
+    
 
 def evaluate_posterior(
-    logits, candidates, temperature, posterior_threshold, posterior_alpha
+    logits, candidates, temperature, posterior_threshold=0.3, posterior_alpha = 0.09, top_p=0.8, sampling = 'typical', fast = True
 ):
     """
     Evaluate the posterior probabilities of the candidates based on the provided logits and choose the best candidate.
@@ -307,36 +401,64 @@ def evaluate_posterior(
         else:
             best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
         return best_candidate, accept_length
-    # Calculate posterior probabilities and thresholds for candidate selection
-    posterior_prob = torch.softmax(logits[:, :-1] / temperature, dim=-1)
-    candidates_prob = torch.gather(
-        posterior_prob, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
-    ).squeeze(-1)
-    posterior_entropy = -torch.sum(
-        posterior_prob * torch.log(posterior_prob + 1e-5), dim=-1
-    )  # torch.sum(torch.log(*)) is faster than torch.prod
-    threshold = torch.minimum(
-        torch.ones_like(posterior_entropy) * posterior_threshold,
-        torch.exp(-posterior_entropy) * posterior_alpha,
-    )
-    posterior_mask = candidates_prob > threshold
-    candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
-
-    # Choose the best candidate based on the evaluated posterior probabilities
-    accept_length = candidates_accept_length.max()
-    if accept_length == 0:
-        # If no candidates are accepted, just choose the first one
-        best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
+        
+    if sampling == 'typical':
+        if fast:
+            posterior_prob = torch.softmax(logits[:, :-1] / temperature, dim=-1)
+            candidates_prob = torch.gather(
+                posterior_prob, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
+            ).squeeze(-1)
+            posterior_entropy = -torch.sum(
+                posterior_prob * torch.log(posterior_prob + 1e-5), dim=-1
+            )  # torch.sum(torch.log(*)) is faster than torch.prod
+            threshold = torch.minimum(
+                torch.ones_like(posterior_entropy) * posterior_threshold,
+                torch.exp(-posterior_entropy) * posterior_alpha,
+            )
+            posterior_mask = candidates_prob > threshold
+            candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
+
+            # Choose the best candidate based on the evaluated posterior probabilities
+            accept_length = candidates_accept_length.max()
+            if accept_length == 0:
+                # If no candidates are accepted, just choose the first one
+                best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
+            else:
+                best_candidates = torch.where(candidates_accept_length == accept_length)[0]
+                # Accept the best one according to likelihood
+                likelihood = torch.sum(
+                    torch.log(candidates_prob[best_candidates, :accept_length]), dim=-1
+                )
+                best_candidate = best_candidates[torch.argmax(likelihood)]
+            return best_candidate, accept_length
+        # Calculate posterior probabilities and thresholds for candidate selection
+        posterior_mask = get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha, fast)
+        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
+        # Choose the best candidate based on the evaluated posterior probabilities
+        accept_length = candidates_accept_length.max()
+        
+        if accept_length == 0:
+            # If no candidates are accepted, just choose the first one
+            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
+        else:
+            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
+            # Accept the best one according to likelihood
+        return best_candidate, accept_length
+    
+    if sampling == 'nucleus':
+        assert top_p < 1.0 + 1e-6, "top_p should between 0 and 1"
+        posterior_mask = get_nucleus_posterior_mask(logits, candidates, temperature, top_p)
+        candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
+        accept_length = candidates_accept_length.max()
+        # Choose the best candidate
+        if accept_length == 0:
+            # Default to the first candidate if none are accepted
+            best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
+        else:
+            best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
+        return best_candidate, accept_length
     else:
-        best_candidates = torch.where(candidates_accept_length == accept_length)[0]
-        # Accept the best one according to likelihood
-        likelihood = torch.sum(
-            torch.log(candidates_prob[best_candidates, :accept_length]), dim=-1
-        )
-        best_candidate = best_candidates[torch.argmax(likelihood)]
-    return best_candidate, accept_length
-
-
+        raise NotImplementedError
 def update_inference_inputs(
     input_ids,
     candidates,