Commit c6c0032

adding json validity test
Parent: 8e2f847

File tree: 4 files changed, +114 −1 lines


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -3,6 +3,7 @@
 # experiment files
 */experiments
 */experiment
+experiment/*
 */archive
 */backup
 */baseline_results
@@ -49,4 +50,4 @@ venv.bak/

 # Coverage Report
 .coverage
-/htmlcov
+/htmlcov

examples/json_output_config.yml

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+save_dir: "./experiment/"
+
+ablation:
+  use_ablate: false
+
+# Data Ingestion -------------------
+data:
+  file_type: "huggingface" # one of 'json', 'csv', 'huggingface'
+  path: "azizshaw/text_to_json"
+  prompt:
+    >- # prompt, make sure column inputs are enclosed in {} brackets and that they match your data
+    {instruction}
+    Now create a json object for the following scenario
+    {input}
+  prompt_stub:
+    >- # Stub to add for training at the end of prompt, for test set or inference, this is omitted; make sure only one variable is present
+    {output}
+  test_size: 0.1 # Proportion of test as % of total; if integer then # of samples
+  train_size: 0.9 # Proportion of train as % of total; if integer then # of samples
+  train_test_split_seed: 42
+
+# Model Definition -------------------
+model:
+  hf_model_ckpt: "facebook/opt-125m"
+  torch_dtype: "bfloat16"
+  # attn_implementation: "flash_attention_2"
+  quantize: true
+  bitsandbytes:
+    load_in_4bit: true
+    bnb_4bit_compute_dtype: "bfloat16"
+    bnb_4bit_quant_type: "nf4"
+
+# LoRA Params -------------------
+lora:
+  task_type: "CAUSAL_LM"
+  r: 16
+  lora_dropout: 0.1
+  target_modules:
+    - q_proj
+    - v_proj
+    - k_proj
+    - o_proj
+    - up_proj
+    - down_proj
+    - gate_proj
+
+# Training -------------------
+training:
+  training_args:
+    num_train_epochs: 5
+    per_device_train_batch_size: 4
+    gradient_accumulation_steps: 4
+    gradient_checkpointing: true
+    optim: "paged_adamw_32bit"
+    logging_steps: 100
+    learning_rate: 2.0e-4
+    bf16: true # Set to true for mixed-precision training on newer GPUs
+    tf32: true
+    # fp16: false # Set to true for mixed-precision training on older GPUs
+    max_grad_norm: 0.3
+    warmup_ratio: 0.03
+    lr_scheduler_type: "constant"
+  sft_args:
+    max_seq_length: 5000
+    # neftune_noise_alpha: None
+
+inference:
+  max_new_tokens: 1024
+  use_cache: true
+  do_sample: true
+  top_p: 0.9
+  temperature: 0.8
+
+qa:
+  llm_tests:
+    - json_valid
+    - jaccard_similarity
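
For orientation, here is a minimal sketch of reading a config like this with plain PyYAML. The field names come straight from the file above; the loading code itself is illustrative, not the toolkit's actual loader, which may differ.

# Illustrative only: a plain PyYAML read of the config above. llmtune has its
# own config-loading machinery; this just shows the structure the file encodes.
import yaml

with open("examples/json_output_config.yml") as f:
    config = yaml.safe_load(f)

# qa.llm_tests is the list that wires up the new json_valid check.
print(config["qa"]["llm_tests"])         # ['json_valid', 'jaccard_similarity']
print(config["model"]["hf_model_ckpt"])  # 'facebook/opt-125m'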

llmtune/qa/qa_tests.py

Lines changed: 17 additions & 0 deletions
@@ -8,10 +8,12 @@
 from nltk.tokenize import word_tokenize
 from rouge_score import rouge_scorer
 from transformers import DistilBertModel, DistilBertTokenizer
+from langchain.evaluation import JsonValidityEvaluator

 from llmtune.qa.generics import LLMQaTest


+json_validity_evaluator = JsonValidityEvaluator()
 model_name = "distilbert-base-uncased"
 tokenizer = DistilBertTokenizer.from_pretrained(model_name)
 model = DistilBertModel.from_pretrained(model_name)
@@ -119,6 +121,21 @@ def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> U
         overlap_percentage = (len(common_words) / len(words_ground_truth)) * 100
         return float(overlap_percentage)

+@QaTestRegistry.register("json_valid")
+class JSONValidityTest(LLMQaTest):
+    """
+    Checks whether valid JSON can be parsed from the model output, according
+    to langchain_core.utils.json.parse_json_markdown.
+    The JSON can be wrapped in markdown and this test will still pass.
+    """
+    @property
+    def test_name(self) -> str:
+        return "json_valid"
+
+    def get_metric(self, prompt: str, ground_truth: str, model_prediction: str) -> float:
+        result = json_validity_evaluator.evaluate_strings(prediction=model_prediction)
+        binary_res = result["score"]
+        return float(binary_res)

 class PosCompositionTest(LLMQaTest):
     def _get_pos_percent(self, text: str, pos_tags: List[str]) -> float:
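
JsonValidityEvaluator is part of LangChain's string-evaluator suite: evaluate_strings(prediction=...) returns a dict whose "score" key is 1 when the prediction parses as JSON and 0 otherwise. A minimal standalone sketch of the check get_metric performs, assuming langchain is installed:

# Standalone sketch of the check JSONValidityTest delegates to, outside llmtune.
from langchain.evaluation import JsonValidityEvaluator

evaluator = JsonValidityEvaluator()

print(evaluator.evaluate_strings(prediction='{"answer": "42"}'))
# {'score': 1}
print(evaluator.evaluate_strings(prediction="{'answer': '42'}"))
# {'score': 0, 'reasoning': '...'}  (a parse-error message accompanies failures)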

tests/qa/test_qa_tests.py

Lines changed: 18 additions & 0 deletions
@@ -9,6 +9,7 @@
     RougeScoreTest,
     VerbPercent,
     WordOverlapTest,
+    JSONValidityTest
 )


@@ -23,6 +24,7 @@
         (VerbPercent, float),
         (AdjectivePercent, float),
         (NounPercent, float),
+        (JSONValidityTest, float),
     ],
 )
 def test_metric_return_type(test_class, expected_type):
@@ -84,3 +86,19 @@ def test_noun_percent():
     test = NounPercent()
     result = test.get_metric("prompt", "The cat", "The cat and the dog")
     assert result >= 0, "Noun percentage should be non-negative."
+
+@pytest.mark.parametrize(
+    "input_string,expected_value",
+    [
+        ('{"Answer": "The cat"}', 1),
+        ("{'Answer': 'The cat'}", 0),  # Double quotes are required in JSON
+        ('{"Answer": "The cat",}', 0),
+        ('{"Answer": "The cat", "test": "case"}', 1),
+        ('```json\n{"Answer": "The cat"}\n```', 1),  # this JSON block can still be processed
+        ('Here is an example of a JSON block: {"Answer": "The cat"}', 0),
+    ],
+)
+def test_json_valid(input_string: str, expected_value: float):
+    test = JSONValidityTest()
+    result = test.get_metric("prompt", "The cat", input_string)
+    assert result == expected_value, f"JSON validity should be {expected_value} but got {result}."
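
The fenced-markdown case scores 1 because, as the class docstring notes, the evaluator parses via langchain_core.utils.json.parse_json_markdown, which strips a ```json fence before handing the string to the JSON parser; the prose-prefixed case scores 0 because the string neither parses directly nor contains a fence to locate the object. A quick sketch, assuming langchain_core is available (it ships with langchain):

# Why the fenced test case passes: parse_json_markdown strips the ```json
# wrapper before parsing, so the embedded object is recovered.
from langchain_core.utils.json import parse_json_markdown

print(parse_json_markdown('```json\n{"Answer": "The cat"}\n```'))
# {'Answer': 'The cat'}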
