Skip to content

Commit 44ed516

Browse files
Authored by: FromCSUZhou, pre-commit-ci[bot], nickcom007
authored
feat: added byte and target token calculations for evaluation datasets, improved calculation logic for BPC/bPPL (#82)
feat: added byte and target token calculations for evaluation datasets, improved calculation logic for BPC/bPPL (#82)

* feat: added byte and target token calculations for evaluation datasets, improved calculation logic for BPC/bPPL
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* Fix: removed unused numpy imports to optimize code
* feat: optimize the calculation logic code and logging function of BPC and bPPL
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* feat: simplify the relevant calculation logic and log code in the verification process
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* fix: add handling for loss calculation

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Nick <nickcom007@gmail.com>
1 parent 2d878d1 commit 44ed516

File tree

6 files changed

+731
-10
lines changed

6 files changed

+731
-10
lines changed

src/core/log_utils.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import math
2+
from loguru import logger
3+
import numbers
4+
5+
6+
def _log_summary_table(
    model_name_or_path,
    eval_loss,
    bpc_metrics,
    token_byte_ratio,
    total_target_tokens,
    total_bytes,
    vocab_size,
    model_params_m,
):
    """Log a vertical key/value summary table of validation metrics.

    Args:
        model_name_or_path: Identifier of the evaluated model.
        eval_loss: Average token-level loss in nats (may be NaN/inf or a
            non-numeric sentinel).
        bpc_metrics: Dict with at least ``'bpc'`` and ``'bppl'`` entries.
        token_byte_ratio: Ratio of target tokens to target bytes.
        total_target_tokens: Total number of target tokens.
        total_bytes: Total number of target UTF-8 bytes.
        vocab_size: Tokenizer vocabulary size.
        model_params_m: Total model parameters in millions, or NaN when the
            model is unavailable.
    """

    def _fmt(value, spec):
        # One shared guard for every numeric cell: format real, finite
        # numbers with `spec`; fall back to str() for NaN/inf or
        # non-numeric sentinels.  (The original used four inconsistent
        # guards, and the BPC/bPPL/ratio ones crashed on non-numeric
        # values because math.isinf was called without an isinstance
        # check first.)
        if isinstance(value, numbers.Real) and math.isfinite(value):
            return format(value, spec)
        return str(value)

    table_data = {
        "Model Name": model_name_or_path,
        "Token Loss (nats)": _fmt(eval_loss, ".5f"),
        "BPC": _fmt(bpc_metrics["bpc"], ".5f"),
        "bPPL": _fmt(bpc_metrics["bppl"], ".5f"),
        "T/B Ratio": _fmt(token_byte_ratio, ".4f"),
        "Target Tokens": str(total_target_tokens),
        "Target Bytes": str(total_bytes),
        "Vocab Size": str(vocab_size),
        "Total Params (M)": _fmt(model_params_m, ".2f"),
    }

    label_width = max(len(label) for label in table_data.keys())
    value_width = max(len(str(value)) for value in table_data.values())
    total_width = label_width + value_width + 3

    title = " Validation Summary "
    pad = max(total_width - len(title), 0)
    # Pad on both sides so the header is exactly `total_width` wide even
    # when the leftover width is odd (the original integer-halved both
    # sides and dropped a character).
    header = "=" * (pad // 2) + title + "=" * (pad - pad // 2)

    lines = [header]
    for label, value in table_data.items():
        if label == "Model Name" and len(value) > value_width:
            # Truncate long model names so the column stays aligned.
            value = value[: value_width - 3] + "..."
        lines.append(f"{label:<{label_width}} | {value:<{value_width}}")
    lines.append("=" * total_width)

    # Emit the whole table through the logger in a single call.  The
    # original printed the rows and footer to stdout while logging only
    # the header, so the table interleaved out of order with the log sink.
    logger.info("\n" + "\n".join(lines) + "\n")

src/core/loss.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import math
2+
import numbers
3+
4+
5+
def calculate_bpc_bppl_metrics(eval_loss, total_target_tokens, total_bytes):
    """
    Compute BPC (Bits Per Character) and bPPL (bits Per Character Perplexity).

    Args:
        eval_loss (float): Average token-level loss in nats.
        total_target_tokens (int): Total number of target tokens.
        total_bytes (int): Total number of target bytes.

    Returns:
        dict: Keys 'bpc', 'bppl', 'nll_token_nats_total',
        'nll_token_bits_total'.  When total_bytes is 0 or eval_loss is
        unusable (non-real, NaN, or infinite), 'bpc' and 'bppl' are
        float('inf') and the totals are float('nan').  'bppl' is also
        float('inf') when bpc itself is infinite or math.pow(2, bpc)
        overflows for a large finite bpc.
    """
    loss_is_unusable = (
        not isinstance(eval_loss, numbers.Real)
        or math.isnan(eval_loss)
        or math.isinf(eval_loss)
    )
    if total_bytes == 0 or loss_is_unusable:
        # No meaningful metrics can be produced; signal with sentinels.
        return {
            "bpc": float("inf"),
            "bppl": float("inf"),
            "nll_token_nats_total": float("nan"),
            "nll_token_bits_total": float("nan"),
        }

    nats_total = eval_loss * total_target_tokens
    bits_total = nats_total / math.log(2)
    bpc = bits_total / total_bytes

    bppl = float("inf")
    if not math.isinf(bpc):
        try:
            bppl = math.pow(2, bpc)
        except OverflowError:
            pass  # 2**bpc exceeds the float range; keep inf.

    return {
        "bpc": bpc,
        "bppl": bppl,
        "nll_token_nats_total": nats_total,
        "nll_token_bits_total": bits_total,
    }
53+
54+
55+
def get_token_byte_ratio(total_target_tokens, total_bytes):
    """
    Compute the token-to-byte ratio of the evaluation targets.

    Args:
        total_target_tokens (int): Total number of target tokens.
        total_bytes (int): Total number of target bytes.

    Returns:
        float: total_target_tokens / total_bytes, or float('inf') when
        total_bytes is 0.
    """
    # Guard the zero-byte case explicitly rather than letting division raise.
    return float("inf") if total_bytes == 0 else total_target_tokens / total_bytes
69+
70+
71+
def calculate_bytes_and_tokens(eval_dataset, tokenizer, logger):
    """
    Compute total UTF-8 bytes and target-token count for an eval dataset.

    Args:
        eval_dataset: Indexable dataset; each item is a mapping with
            parallel "input_ids" and "target_mask" sequences, where
            mask == 1 marks a target position.
        tokenizer: Tokenizer providing ``decode(ids, skip_special_tokens=...)``.
        logger: Logger instance used for the progress message.

    Returns:
        tuple: (total_bytes, total_target_tokens).
    """
    total_bytes = 0
    total_target_tokens = 0
    logger.info(
        "Calculating total bytes and target tokens in the evaluation dataset..."
    )
    # Index-based access (not plain iteration) because the dataset is only
    # assumed to support __len__ and __getitem__ here.
    for idx in range(len(eval_dataset)):
        item = eval_dataset[idx]
        # Keep only the token ids at masked (target) positions.  The loop
        # variable is named token_id: the original used `id`, shadowing the
        # builtin of the same name.
        target_ids = [
            token_id
            for token_id, mask in zip(item["input_ids"], item["target_mask"])
            if mask == 1
        ]
        if target_ids:
            # Byte length is measured on the decoded text with special
            # tokens stripped, so specials don't inflate the byte count.
            target_text = tokenizer.decode(target_ids, skip_special_tokens=True)
            total_bytes += len(target_text.encode("utf-8"))
            total_target_tokens += len(target_ids)
    return total_bytes, total_target_tokens

src/validate.py

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
import numbers
23
import os
34
import time
45
import shutil
@@ -31,12 +32,19 @@
3132
handle_runtime_error,
3233
handle_value_error,
3334
)
35+
from core.loss import (
36+
calculate_bpc_bppl_metrics,
37+
get_token_byte_ratio,
38+
calculate_bytes_and_tokens,
39+
)
40+
from core.log_utils import _log_summary_table
3441
from tenacity import retry, stop_after_attempt, wait_exponential
3542
from client.fed_ledger import FedLedger
3643
from peft import PeftModel
3744
import sys
3845
import math
3946

47+
4048
load_dotenv()
4149
TIME_SLEEP = int(os.getenv("TIME_SLEEP", 60 * 3))
4250
ASSIGNMENT_LOOKUP_INTERVAL = 60 * 3 # 3 minutes
@@ -292,6 +300,14 @@ def validate(
292300

293301
model = None
294302
eval_dataset = None
303+
bpc_metrics_results = {
304+
"bpc": float("inf"),
305+
"bppl": float("inf"),
306+
"nll_token_nats_total": float("nan"),
307+
"nll_token_bits_total": float("nan"),
308+
}
309+
token_byte_ratio_value = float("inf")
310+
eval_loss = float("nan") # Initialize eval_loss
295311

296312
try:
297313
fed_ledger = FedLedger(FLOCK_API_KEY)
@@ -379,6 +395,28 @@ def validate(
379395
eval_dataset = load_sft_dataset(
380396
eval_file, context_length, template_name=base_model, tokenizer=tokenizer
381397
)
398+
399+
total_bytes, total_target_tokens = calculate_bytes_and_tokens(
400+
eval_dataset, tokenizer, logger
401+
)
402+
403+
if total_bytes == 0:
404+
logger.warning(
405+
"Total bytes in the evaluation dataset is 0. Cannot calculate BPC. Check dataset processing."
406+
)
407+
eval_loss_to_submit = LOSS_FOR_MODEL_PARAMS_EXCEED
408+
else:
409+
logger.info(f"Total target bytes (B): {total_bytes}")
410+
logger.info(f"Total target tokens (T): {total_target_tokens}")
411+
token_byte_ratio_value = get_token_byte_ratio(
412+
total_target_tokens, total_bytes
413+
)
414+
logger.info(f"Token/Byte ratio (T/B): {token_byte_ratio_value:.4f}")
415+
if token_byte_ratio_value < 0.1:
416+
logger.warning(
417+
f"Token/Byte ratio ({token_byte_ratio_value:.4f}) is unusually low. Potential manipulation detected."
418+
)
419+
382420
model = load_model(
383421
model_name_or_path, lora_only, revision, val_args, cached_lora
384422
)
@@ -413,19 +451,60 @@ def validate(
413451
data_collator=data_collator,
414452
)
415453

454+
logger.info("Starting evaluation...")
416455
eval_result = trainer.evaluate()
417456
eval_loss = eval_result["eval_loss"]
418-
logger.info("evaluate result is %s" % str(eval_result))
457+
458+
logger.info("Raw evaluation result: %s" % str(eval_result))
459+
460+
if total_bytes > 0:
461+
bpc_metrics_results = calculate_bpc_bppl_metrics(
462+
eval_loss, total_target_tokens, total_bytes
463+
)
464+
465+
is_bpc_valid = not math.isinf(bpc_metrics_results["bpc"])
466+
467+
_log_summary_table(
468+
model_name_or_path=model_name_or_path,
469+
eval_loss=eval_loss,
470+
bpc_metrics=bpc_metrics_results,
471+
token_byte_ratio=token_byte_ratio_value,
472+
total_target_tokens=total_target_tokens,
473+
total_bytes=total_bytes,
474+
vocab_size=tokenizer.vocab_size,
475+
model_params_m=(sum(p.numel() for p in model.parameters()) / 1e6)
476+
if model
477+
else float("nan"),
478+
)
479+
419480
if local_test:
420-
logger.info("The model can be correctly validated by validators.")
481+
logger.info(
482+
"The model can be correctly validated by validators (raw loss)."
483+
)
484+
if not is_bpc_valid: # If BPC is inf
485+
logger.warning(
486+
"Could not calculate BPC/bPPL for local test due to zero bytes or invalid loss."
487+
)
421488
return
422-
# sometimes the loss might not be a valid float
423-
if isinstance(eval_loss, float) and (
424-
math.isnan(eval_loss) or math.isinf(eval_loss)
425-
):
426-
eval_loss = LOSS_FOR_MODEL_PARAMS_EXCEED
489+
490+
eval_loss_to_submit = LOSS_FOR_MODEL_PARAMS_EXCEED # Default to high loss
491+
492+
if is_bpc_valid:
493+
eval_loss_to_submit = bpc_metrics_results["bpc"]
494+
else:
495+
if total_bytes == 0:
496+
logger.error("Total bytes is 0, submitting high loss.")
497+
elif (
498+
not isinstance(eval_loss, numbers.Real)
499+
or math.isnan(eval_loss)
500+
or math.isinf(eval_loss)
501+
):
502+
logger.error(f"Invalid eval_loss ({eval_loss}), submitting high loss.")
503+
427504
resp = fed_ledger.submit_validation_result(
428-
assignment_id=assignment_id, loss=eval_loss, gpu_type=gpu_type
505+
assignment_id=assignment_id,
506+
loss=eval_loss_to_submit, # Submit BPC as loss
507+
gpu_type=gpu_type,
429508
)
430509
# check response is 200
431510
if resp.status_code != 200:
@@ -439,10 +518,9 @@ def validate(
439518
fed_ledger.mark_assignment_as_failed(assignment_id)
440519
return
441520
logger.info(
442-
f"Successfully submitted validation result for assignment {assignment_id}"
521+
f"Successfully submitted validation result (BPC: {eval_loss_to_submit}) for assignment {assignment_id}"
443522
)
444523

445-
# raise for exceptions, will handle at `loop` level
446524
except Exception as e:
447525
raise e
448526
finally:

tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Tests package for llm-loss-validator

tests/core/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Core module tests for llm-loss-validator

0 commit comments

Comments
 (0)