Commit 4105067

Update llm_utils.py
Added a model- and environment-agnostic CerebrosNotGPT class to llm_utils.
1 parent fc6cac7 commit 4105067
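For reference, a minimal usage sketch (not part of this diff): it assumes `base_model` is a trained Keras model that maps a padded [1, max_sequence_length] int32 batch of token IDs to softmax probabilities over the vocabulary, that `tokenizer` exposes encode()/decode() methods, and that padding_token=0 is an arbitrary example choice.

    # Illustrative only - base_model, tokenizer, and padding_token=0 are assumptions,
    # and CerebrosNotGPTConfig / CerebrosNotGPT come from cerebrosllmutils/llm_utils.py.
    config = CerebrosNotGPTConfig(max_sequence_length=1536, padding_token=0)
    not_gpt = CerebrosNotGPT(config=config, model_0=base_model)

    prompt_ids = tokenizer.encode("Once upon a time")  # hypothetical tokenizer API
    output_ids = not_gpt.generate(
        prompt_ids,
        do_sample=True,
        max_new_tokens=64,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.1,
    )
    print(tokenizer.decode(output_ids))  # generate() returns prompt + generated token IDs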

1 file changed: +263 -0 lines changed


cerebrosllmutils/llm_utils.py

Lines changed: 263 additions & 0 deletions
@@ -310,3 +310,266 @@ def reset_state(self):
        self.total_crossentropy.assign(0.0)
        self.count.assign(0.0)

@tf.keras.utils.register_keras_serializable()
class CerebrosNotGPTConfig:
    def __init__(self, max_sequence_length=1536, padding_token=None):
        self.max_sequence_length = max_sequence_length
        self.padding_token = padding_token

    def get_config(self):
        return {
            'max_sequence_length': self.max_sequence_length,
            'padding_token': self.padding_token
            # NO model_0 here!
        }

    @classmethod
    def from_config(cls, config):
        return cls(**config)  # No model_0 to handle

@tf.keras.utils.register_keras_serializable()
class CerebrosNotGPT(tf.keras.Model):
    def __init__(self, config, model_0=None, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.max_sequence_length = config.max_sequence_length
        self.padding_token = config.padding_token

        # Handle model assignment
        if model_0 is not None:
            self.model = model_0
        else:
            # This branch is for deserialization - Keras will restore self.model automatically
            # if it was a proper Keras layer/model that was added via self.model = some_keras_model
            pass

    def get_config(self):
        return {
            'config': self.config.get_config()
            # NO model reference here!
        }

    @classmethod
    def from_config(cls, config):
        config_obj = CerebrosNotGPTConfig.from_config(config['config'])
        return cls(config=config_obj)  # Keras will handle model restoration

    def call(self, inputs):
        return self.model(inputs)

    @staticmethod
    def apply_top_k_probs(probs, k):
        if k is None or k <= 0:
            return probs
        # Flatten and argsort for indices
        sorted_indices = tf.argsort(probs, direction='DESCENDING')
        keep_indices = sorted_indices[:k]
        mask = tf.zeros_like(probs, dtype=tf.bool)
        mask = tf.tensor_scatter_nd_update(mask, tf.reshape(keep_indices, (-1, 1)),
                                           tf.ones((k,), dtype=tf.bool))
        filtered_probs = tf.where(mask, probs, tf.zeros_like(probs))
        # Renormalize
        filtered_probs = filtered_probs / tf.reduce_sum(filtered_probs)
        return filtered_probs

    @staticmethod
    def apply_top_p_probs(probs, p):
        if p is None or p >= 1.0:
            return probs
        sorted_indices = tf.argsort(probs, direction='DESCENDING')
        sorted_probs = tf.gather(probs, sorted_indices)
        cumulative_probs = tf.cumsum(sorted_probs)
        mask = cumulative_probs <= p
        # Always keep at least 1 token
        mask = tf.concat([tf.constant([True]), mask[1:]], axis=0)
        keep_indices = tf.boolean_mask(sorted_indices, mask)
        filtered_probs = tf.where(
            tf.reduce_any(tf.equal(tf.range(tf.shape(probs)[0])[:, None], keep_indices), axis=1), probs,
            tf.zeros_like(probs))
        # Renormalize
        filtered_probs = filtered_probs / tf.reduce_sum(filtered_probs)
        return filtered_probs

    def generate(self,
                 token_ids,
                 do_sample=False,
                 max_new_tokens=None,
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 frequency_penalty=None,
                 presence_penalty=None,
                 repetition_penalty=None):
        """
        Generate text autoregressively from token IDs.
        Applies filtering in sequence: penalties -> temperature -> top-k -> top-p.
        """
        # Convert token_ids to a list if it is not one already
        if not isinstance(token_ids, list):
            token_ids = list(token_ids)

        # Determine the actual maximum number of new tokens
        if max_new_tokens is None:
            max_new_tokens = self.max_sequence_length - len(token_ids)
        else:
            max_new_tokens = min(max_new_tokens, self.max_sequence_length - len(token_ids))

        # Initialize the generated tokens list
        generated_tokens = []
        current_tokens = token_ids.copy()

        # Autoregressive generation loop
        for _ in range(max_new_tokens):
            # Pad or truncate to max_sequence_length
            if len(current_tokens) > self.max_sequence_length:
                input_tokens = current_tokens[-self.max_sequence_length:]
            else:
                padding_needed = self.max_sequence_length - len(current_tokens)
                input_tokens = current_tokens + [self.padding_token] * padding_needed

            # Convert to tensor and get the model's prediction
            input_tensor = tf.constant([input_tokens], dtype=tf.int32)
            probs_nested = self.model(input_tensor)
            probs = probs_nested[0]  # The model already outputs softmax probabilities, not logits
            logits = tf.math.log(probs + 1e-20)  # Convert back to log space for penalty application

            if do_sample:
                # Apply frequency/presence penalties to logits
                if frequency_penalty is not None or presence_penalty is not None:
                    # Collect token counts from current_tokens
                    token_counts = {}
                    for t in current_tokens:
                        token_counts[t] = token_counts.get(t, 0) + 1

                    # Prepare penalty tensor
                    vocab_size = tf.shape(logits)[0]
                    penalties = tf.zeros_like(logits)

                    for token_id, count in token_counts.items():
                        if token_id >= vocab_size:
                            continue
                        penalty = 0.0
                        if presence_penalty is not None:
                            penalty += presence_penalty
                        if frequency_penalty is not None:
                            penalty += frequency_penalty * count

                        penalties = tf.tensor_scatter_nd_add(
                            penalties,
                            [[token_id]],
                            [penalty]
                        )

                    # Subtract penalties from logits
                    logits = logits - penalties

                # Apply repetition penalty (standard approach)
                if repetition_penalty is not None and repetition_penalty != 1.0:
                    # Collect unique tokens that have appeared
                    unique_tokens = list(set(current_tokens))
                    vocab_size = tf.shape(logits)[0]

                    for token_id in unique_tokens:
                        if token_id < vocab_size:
                            # Divide logits of repeated tokens by the penalty
                            logits = tf.tensor_scatter_nd_update(
                                logits,
                                [[token_id]],
                                [logits[token_id] / repetition_penalty]
                            )

                # Apply temperature
                if temperature != 1.0:
                    logits = logits / temperature

                # Convert back to probabilities
                probs = tf.nn.softmax(logits)

                # Apply top-k filtering (if specified)
                if top_k is not None and top_k > 0:
                    k = min(top_k, tf.shape(probs)[0])
                    # Get top-k values and indices
                    top_k_values, top_k_indices = tf.nn.top_k(probs, k=k, sorted=False)
                    # Create mask for top-k positions
                    top_k_mask = tf.scatter_nd(
                        tf.expand_dims(top_k_indices, 1),
                        tf.ones_like(top_k_values, dtype=tf.bool),
                        tf.shape(probs)
                    )
                    # Zero out non-top-k probabilities
                    probs = tf.where(top_k_mask, probs, tf.zeros_like(probs))
                    # Renormalize
                    probs = probs / tf.reduce_sum(probs)
                    print(
                        f">>> After top_k: {tf.shape(probs)} shape, {tf.reduce_sum(tf.cast(probs > 1e-8, tf.int32))} non-zero probs")

                # Apply top-p filtering (if specified)
                if top_p is not None and top_p < 1.0:
                    # Sort probabilities in descending order
                    sorted_indices = tf.argsort(probs, direction='DESCENDING')
                    sorted_probs = tf.gather(probs, sorted_indices)
                    cumulative_probs = tf.cumsum(sorted_probs)
                    # Create mask for top-p
                    mask = cumulative_probs <= top_p
                    # Always keep at least one token
                    mask = tf.concat([tf.constant([True]), mask[1:]], axis=0)
                    # Get indices to keep
                    keep_indices = tf.boolean_mask(sorted_indices, mask)
                    # Create mask over the original indices
                    filter_mask = tf.scatter_nd(
                        tf.expand_dims(keep_indices, 1),
                        tf.ones_like(keep_indices, dtype=tf.bool),
                        tf.shape(probs)
                    )
                    # Apply mask and renormalize
                    probs = tf.where(filter_mask, probs, tf.zeros_like(probs))
                    probs = probs / tf.reduce_sum(probs)
                    print(
                        f">>> After top_p: {tf.shape(probs)} shape, {tf.reduce_sum(tf.cast(probs > 1e-8, tf.int32))} non-zero probs")

                # Sample from the final filtered distribution
                # Get non-zero indices and their probabilities
                non_zero_mask = probs > 1e-8
                if tf.reduce_any(non_zero_mask):
                    filtered_indices = tf.where(non_zero_mask)[:, 0]  # Indices into the vocabulary
                    filtered_probs = tf.boolean_mask(probs, non_zero_mask)  # Their probabilities
                    # Sample
                    sampled_local_index = tf.random.categorical(tf.math.log(filtered_probs)[None, :], 1)[0, 0]
                    # Map back to the vocabulary index
                    next_token_id = int(filtered_indices[sampled_local_index].numpy())
                else:
                    # Fallback if all probabilities are zero
                    warn(
                        "Token sampling reverted to greedy decoding because no probabilities were > 0; this is unexpected")
                    next_token_id = int(tf.argmax(probs, axis=-1).numpy())

            else:
                # Greedy decoding (argmax) - apply repetition penalty if requested
                if repetition_penalty is not None and repetition_penalty != 1.0:
                    unique_tokens = list(set(current_tokens))
                    vocab_size = tf.shape(logits)[0]
                    for token_id in unique_tokens:
                        if token_id < vocab_size:
                            logits = tf.tensor_scatter_nd_update(
                                logits,
                                [[token_id]],
                                [logits[token_id] / repetition_penalty]
                            )

                next_token_id = int(tf.argmax(logits, axis=-1).numpy())

            # Check for termination condition
            if next_token_id == self.padding_token:
                break

            # Add to generated tokens and update current tokens
            generated_tokens.append(int(next_token_id))
            current_tokens.append(int(next_token_id))

            # Check if we've reached max sequence length
            if len(current_tokens) >= self.max_sequence_length:
                break

        return token_ids + generated_tokens
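Because get_config()/from_config() intentionally omit model_0, a save/load round trip relies on Keras restoring the wrapped model as a tracked sub-layer, as the in-code comments describe. A minimal sketch of that round trip, continuing from the usage sketch above (file name and behavior on reload are assumptions, not verified by this commit):

    # Sketch only - assumes `import tensorflow as tf` and that both classes are
    # registered via @tf.keras.utils.register_keras_serializable() as in this commit.
    not_gpt.save("cerebros_not_gpt.keras")  # illustrative path
    restored = tf.keras.models.load_model("cerebros_not_gpt.keras")
    # from_config() rebuilds only the config; the tracked self.model sub-layer
    # is expected to be restored by Keras from the saved file.
    restored.generate(prompt_ids, do_sample=False, max_new_tokens=16)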
