|
18 | 18 |
|
19 | 19 | import logging |
20 | 20 |
|
21 | | -from pypef.plm.prosst_lora_tune import get_logits_from_full_seqs |
22 | 21 | logger = logging.getLogger('pypef.llm.esm_lora_tune') |
23 | 22 |
|
24 | 23 | import torch |
25 | | -import torch.nn.functional as F |
26 | 24 | import numpy as np |
27 | 25 | from scipy.stats import spearmanr |
28 | 26 | from tqdm import tqdm |
29 | 27 |
|
30 | | - |
31 | 28 | from peft import LoraConfig, get_peft_model |
32 | 29 | from transformers import logging as hf_logging |
33 | 30 | hf_logging.set_verbosity_error() |
@@ -143,314 +140,6 @@ def esm_infer(xs, attention_mask, model, device: str | None = None, verbose=Fals |
143 | 140 | return torch.flatten(y_preds_total) |
144 | 141 |
|
145 | 142 |
|
def unmasked_wt_score(
        tokenized_sequences,
        attention_mask,
        wt_input_ids,
        model,
        train: bool = False,
        cut_special_tokens: bool = True,  # assumption: cut first and last token
        device=None,
        **kwargs
        ):
    """Score candidate sequences under a single wild-type forward pass.

    Runs the model once on ``wt_input_ids`` (wt-marginals style) and, for each
    candidate sequence, sums the log-probabilities of its tokens under the
    wild-type output distribution.

    Parameters
    ----------
    tokenized_sequences : (B, L) tensor of candidate token ids.
    attention_mask : scalar or (L,) mask value(s), broadcast to the WT shape.
    wt_input_ids : (L,) or (1, L) wild-type token ids.
    model : callable returning an object with a ``.logits`` attribute.
    train : keep gradients when True; otherwise run under ``torch.no_grad()``.
    cut_special_tokens : drop first and last token (assumed CLS/EOS).
    device : target device; resolved via ``get_device()`` when None.
    **kwargs : may carry ``structure_input_ids`` for structure-aware models.

    Returns
    -------
    (B,) tensor of per-sequence summed log-probabilities (float64 accumulation).
    """
    if device is None:
        device = get_device()
    if wt_input_ids.dim() == 1:
        wt_input_ids = wt_input_ids.unsqueeze(0)
    structure_input_ids = kwargs.get("structure_input_ids", None)

    # Broadcast the (possibly scalar) attention mask to the WT input shape.
    attention_masks = torch.as_tensor(
        np.full(np.shape(wt_input_ids), attention_mask)).to(torch.int64)

    def _forward():
        # Single wild-type forward pass; structure tokens passed through when given.
        if structure_input_ids is not None:
            return model(
                input_ids=wt_input_ids.to(device),
                attention_mask=attention_masks.to(device),
                ss_input_ids=structure_input_ids.to(device)
            )
        return model(
            wt_input_ids.to(device),
            attention_masks.to(device),
            output_hidden_states=False
        )

    if train:
        outputs = _forward()
    else:
        with torch.no_grad():
            outputs = _forward()

    logits = outputs.logits.squeeze(0)  # remove batch dim
    # Better make sure that special tokens are always removed / masked
    # and only pure amino acid sequence tokens are present / unmasked
    tokenized_seq_len = tokenized_sequences.shape[1]
    if cut_special_tokens:
        logits = logits[1:-1]  # drop CLS/EOS
        tokenized_seq_len -= 2
    token_probs = torch.log_softmax(logits, dim=-1)
    assert tokenized_seq_len == token_probs.shape[0], (
        f"{tokenized_seq_len} != {token_probs.shape[0]}")

    def _seq_logprob(seq):
        # Pick each token's log-probability under the WT distribution and sum.
        ids = seq[1:-1] if cut_special_tokens else seq
        positions = torch.arange(ids.shape[0], device=ids.device)
        return token_probs[positions, ids].sum(dtype=torch.float64)

    return torch.stack([_seq_logprob(seq) for seq in tokenized_sequences])
218 | | - |
219 | | - |
def esm_mutation_only_mutation_masked_pll(
    tokenized_sequences: torch.Tensor,  # (B, L) candidate token ids
    wt_input_ids: torch.Tensor,  # (L,) or (1, L) wild-type token ids
    attention_mask: torch.Tensor,  # (L,) or (1, L)
    model,
    mask_token_id: int,
    train: bool = False,
    device: str | None = None,
    verbose: bool = False,
    **kwargs
):
    """
    Mutation-only masked pseudo-log-likelihood for sequences.

    For each sequence, every position that differs from the wild type
    (excluding padding and the first/last token, assumed CLS/EOS) is masked
    one at a time; the model's log-probability of the true mutant token at
    the masked position is accumulated into that sequence's score.

    Returns a (B,) tensor of per-sequence PLL scores.
    """
    if device is None:
        # Resolve device like the sibling scoring functions do.
        device = get_device()
    tokenized_sequences = tokenized_sequences.to(device)
    structure_input_ids = kwargs.get("structure_input_ids", None)
    if structure_input_ids is not None:
        assert structure_input_ids.shape[1] == tokenized_sequences.shape[1], (
            f"{structure_input_ids.shape[1]} != {tokenized_sequences.shape[1]}")
        structure_input_ids = structure_input_ids.to(device)
    if wt_input_ids.dim() == 2 and wt_input_ids.shape[0] == 1:
        wt_input_ids = wt_input_ids.squeeze(0)
    wt_input_ids = wt_input_ids.to(device)
    if attention_mask.dim() == 2 and attention_mask.shape[0] == 1:
        attention_mask = attention_mask.squeeze(0)
    attention_mask = attention_mask.to(device)
    plls = torch.empty(len(tokenized_sequences), device=device)
    for i, tokenized_seq in enumerate(tokenized_sequences):
        assert tokenized_seq.dim() == 1
        assert wt_input_ids.dim() == 1
        assert attention_mask.dim() == 1
        assert tokenized_seq.shape == wt_input_ids.shape == attention_mask.shape
        pll = torch.tensor(0.0, device=device)

        # Identify mutated positions (exclude padding, CLS, EOS)
        diff = (tokenized_seq != wt_input_ids) & (attention_mask == 1)
        diff[0] = False
        diff[-1] = False

        mutated_positions = diff.nonzero(as_tuple=False).flatten()

        for pos in tqdm(
            mutated_positions,
            desc="Masked PLL (single sequence)",
            disable=not verbose
        ):
            masked_input_ids = tokenized_seq.clone()
            masked_input_ids[pos] = mask_token_id
            if structure_input_ids is not None:
                # NOTE(review): assumes structure ids are (1, L) and share the
                # sequence mask token id — confirm for the structure vocabulary.
                masked_ss_input_ids = structure_input_ids.clone()
                masked_ss_input_ids[0, pos] = mask_token_id

            if train:
                if structure_input_ids is not None:
                    outputs = model(
                        input_ids=masked_input_ids.unsqueeze(0),
                        attention_mask=attention_mask.unsqueeze(0),
                        ss_input_ids=masked_ss_input_ids
                    )
                else:
                    outputs = model(
                        input_ids=masked_input_ids.unsqueeze(0),
                        attention_mask=attention_mask.unsqueeze(0),
                        output_hidden_states=False
                    )
            else:
                with torch.no_grad():
                    if structure_input_ids is not None:
                        outputs = model(
                            input_ids=masked_input_ids.unsqueeze(0),
                            attention_mask=attention_mask.unsqueeze(0),
                            ss_input_ids=masked_ss_input_ids
                        )
                    else:
                        outputs = model(
                            input_ids=masked_input_ids.unsqueeze(0),
                            attention_mask=attention_mask.unsqueeze(0),
                            output_hidden_states=False
                        )
            logits = outputs.logits  # (1, L, V)

            # torch.log_softmax instead of F.log_softmax: this module does not
            # import torch.nn.functional, and siblings already use torch.log_softmax.
            log_probs = torch.log_softmax(logits[0, pos], dim=-1)
            true_token = tokenized_seq[pos]

            pll = pll + log_probs[true_token]

        plls[i] = pll

    return plls
311 | | - |
312 | | - |
def esm_mutation_all_pos_masked_pll(
    tokenized_sequences: torch.Tensor,  # (B, L) candidate token ids
    attention_mask: torch.Tensor,  # (L,) or (1, L)
    model,
    mask_token_id: int,
    train: bool = False,
    device: str | None = None,
    verbose: bool = False,
    **kwargs
):
    """
    All-positions masked pseudo-log-likelihood for sequences.

    For each sequence, every real token position (attention mask == 1,
    excluding the first/last token, assumed CLS/EOS) is masked one at a
    time; the model's log-probability of the true token at the masked
    position is accumulated into that sequence's score.

    Returns a (B,) tensor of per-sequence PLL scores.
    """
    if device is None:
        # Resolve device like the sibling scoring functions do.
        device = get_device()
    structure_input_ids = kwargs.get("structure_input_ids", None)
    if structure_input_ids is not None:
        assert structure_input_ids.shape[1] == tokenized_sequences.shape[1], (
            f"{structure_input_ids.shape[1]} != {tokenized_sequences.shape[1]}")
        structure_input_ids = structure_input_ids.to(device)
    tokenized_sequences = tokenized_sequences.to(device)
    if attention_mask.dim() == 2 and attention_mask.shape[0] == 1:
        attention_mask = attention_mask.squeeze(0)
    attention_mask = attention_mask.to(device)
    plls = torch.empty(len(tokenized_sequences), device=device)
    for i, tokenized_seq in enumerate(tokenized_sequences):
        seq_len = tokenized_seq.shape[0]
        pll = torch.tensor(0.0, device=device)

        # Positions to score: all real tokens except CLS/EOS
        positions = (attention_mask == 1).nonzero(as_tuple=False).flatten()
        positions = positions[(positions != 0) & (positions != seq_len - 1)]

        for pos in tqdm(
            positions,
            desc="Masked PLL (single sequence)",
            disable=not verbose
        ):
            masked_input_ids = tokenized_seq.clone()
            masked_input_ids[pos] = mask_token_id

            if structure_input_ids is not None:
                # NOTE(review): assumes structure ids are (1, L) and share the
                # sequence mask token id — confirm for the structure vocabulary.
                masked_ss_input_ids = structure_input_ids.clone()
                masked_ss_input_ids[0, pos] = mask_token_id

            if train:
                if structure_input_ids is not None:
                    outputs = model(
                        input_ids=masked_input_ids.unsqueeze(0),
                        attention_mask=attention_mask.unsqueeze(0),
                        ss_input_ids=masked_ss_input_ids
                    )
                else:
                    outputs = model(
                        input_ids=masked_input_ids.unsqueeze(0),
                        attention_mask=attention_mask.unsqueeze(0),
                        output_hidden_states=False
                    )
            else:
                with torch.no_grad():
                    if structure_input_ids is not None:
                        outputs = model(
                            input_ids=masked_input_ids.unsqueeze(0),
                            attention_mask=attention_mask.unsqueeze(0),
                            ss_input_ids=masked_ss_input_ids
                        )
                    else:
                        outputs = model(
                            input_ids=masked_input_ids.unsqueeze(0),
                            attention_mask=attention_mask.unsqueeze(0),
                            output_hidden_states=False
                        )
            logits = outputs.logits  # (1, L, V)

            # torch.log_softmax instead of F.log_softmax: this module does not
            # import torch.nn.functional, and siblings already use torch.log_softmax.
            log_probs = torch.log_softmax(logits[0, pos], dim=-1)
            true_token = tokenized_seq[pos]
            pll = pll + log_probs[true_token]

        plls[i] = pll

    return plls
393 | | - |
394 | | - |
def plm_inference(
    xs,
    wt_input_ids,
    attention_mask,
    model,
    mask_token_id,
    inference_type='unmasked',
    wt_structure_input_ids=None,
    batch_size=5,
    train=False,
    device=None,
    verbose=False,
):
    """
    Batched protein-LM scoring dispatcher.

    Selects a scoring function by ``inference_type`` and applies it to ``xs``
    in batches of ``batch_size``:

    - 'mutation-masking'                 -> esm_mutation_only_mutation_masked_pll
    - 'full-masking' / 'all-pos-masking' -> esm_mutation_all_pos_masked_pll
    - 'unmasked' / 'wt-marginals'        -> unmasked_wt_score

    Returns a 1-D tensor of per-sequence scores, concatenated over batches.

    Raises
    ------
    SystemError
        For an unknown ``inference_type``.  NOTE(review): ValueError would be
        conventional; kept as SystemError so existing callers' handlers work.
    """
    if device is None:
        device = get_device()

    model = model.to(device)

    if not isinstance(xs, torch.Tensor):
        xs = torch.tensor(xs, dtype=torch.long)

    if not isinstance(attention_mask, torch.Tensor):
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
    if inference_type == 'mutation-masking':
        inference_function = esm_mutation_only_mutation_masked_pll
    elif inference_type in ['full-masking', 'all-pos-masking']:
        inference_function = esm_mutation_all_pos_masked_pll
    elif inference_type in ['unmasked', 'wt-marginals']:
        inference_function = unmasked_wt_score
    else:
        # List every accepted value (the old message omitted the aliases).
        raise SystemError(
            "Choose between 'mutation-masking', "
            "'full-masking'/'all-pos-masking', and 'unmasked'/'wt-marginals'"
        )

    scores = []

    xs_b = get_batches(xs, dtype=int, batch_size=batch_size, keep_remaining=True, verbose=True)
    # NOTE(review): assumes `device` is a string ('cpu'/'cuda') — a
    # torch.device object has no .upper(); confirm callers.
    # (Stray trailing quote removed from the progress-bar description.)
    desc = f"Inference: {inference_type} batch (size={batch_size}) processing ({device.upper()})"

    pbar = tqdm(
        range(len(xs_b)),
        desc=desc,
        disable=not verbose
    )

    for i in pbar:
        pll = inference_function(
            # as_tensor avoids a redundant copy (and a UserWarning) when the
            # batch is already a tensor — `xs` was converted to one above.
            tokenized_sequences=torch.as_tensor(xs_b[i]),
            wt_input_ids=wt_input_ids,
            structure_input_ids=wt_structure_input_ids,
            attention_mask=attention_mask,
            model=model,
            mask_token_id=mask_token_id,
            train=train,
            device=device,
            verbose=False
        )
        scores.append(pll)
    return torch.cat(scores)
452 | | - |
453 | | - |
454 | 143 | def esm_train( |
455 | 144 | xs, attention_mask, scores, loss_fn, model, optimizer, n_epochs=3, |
456 | 145 | device: str | None = None, seed: int | None = None, |
|
0 commit comments