From 9438de3c8ca40f0871561e4e2e2827dedd07d052 Mon Sep 17 00:00:00 2001
From: Kaushik Chatterjee
Date: Sun, 26 May 2024 23:19:44 -0400
Subject: [PATCH 1/9] Created README.md

---
 json-generation/README.md | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 json-generation/README.md

diff --git a/json-generation/README.md b/json-generation/README.md
new file mode 100644
index 000000000..feb72db2d
--- /dev/null
+++ b/json-generation/README.md
@@ -0,0 +1,7 @@
+This is an implementation of a JSON mode for small LLMs, using a combination of Hermes 2 Pro, a fine-tuned Mistral 7B, and Jsonformer.
+
+Hermes 2 Pro is finetuned from Mistral's 7b-v0.1 model, incorporating a newly developed Function Calling and JSON Mode dataset provided by Nous Research. As a result, Hermes is finetuned to perform better at both function calling and general structured data tasks. Hermes 2 Pro was chosen over the base Mistral 7B due to its strong performance on structured JSON output, achieving 84% on the evaluation created in partnership with Fireworks.AI. More information about the model and its development can be found on its HuggingFace card: https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B
+
+In order to further mitigate the risk of hallucination, we use the open-source library Jsonformer (https://github.com/1rgs/jsonformer/?tab=readme-ov-file). Jsonformer is a wrapper around Hugging Face models that fills in the _fixed_ tokens during the generation process, delegating only the task of generating the content tokens to the language model. As a result, the generated JSON will always be syntactically correct (as there is no opportunity for hallucinations thanks to the separation of concerns), and generation is highly efficient, since only the content tokens need to be generated rather than an entire JSON string. By wrapping Hermes with Jsonformer, we hope to prevent any possibility of malformed or invalid JSON structure while increasing model performance and speed on content token generation.
+
+The modifications I made to the Model class structure are the addition of a `schema` parameter, to allow the user to specify the desired JSON schema for generation, as well as adding a `latency_metrics` dictionary which records various metrics related to the latency of the model, namely prefill time, time to first token, time per output token, and total generation time.
\ No newline at end of file

From 1eb9ad4c0406540c759713b4b279d5e4283156e1 Mon Sep 17 00:00:00 2001
From: Kaushik Chatterjee
Date: Sun, 26 May 2024 23:20:04 -0400
Subject: [PATCH 2/9] Created Config

---
 json-generation/config.yaml | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 json-generation/config.yaml

diff --git a/json-generation/config.yaml b/json-generation/config.yaml
new file mode 100644
index 000000000..e81399416
--- /dev/null
+++ b/json-generation/config.yaml
@@ -0,0 +1,34 @@
+description: Hermes 2 Pro (Mistral 7B), optimized for structured JSON output! Compatible with OpenAI Client
+environment_variables: {}
+external_package_dirs: []
+model_cache:
+- allow_patterns:
+  - '*.json'
+  - '*.safetensors'
+  - '*.model'
+  repo_id: NousResearch/Hermes-2-Pro-Mistral-7B
+model_metadata:
+  example_model_input:
+    messages:
+    - content: What is the mistral wind?
+ role: user + model: NousResearch/Hermes-2-Pro-Mistral-7B + repo_id: NousResearch/Hermes-2-Pro-Mistral-7B + pretty_name: Hermes 2 Pro - Mistral 7B + tags: + - text-generation + - openai-compatible +model_name: Hermes 2 Pro - Mistral 7B +python_version: py311 +requirements: +- accelerate +- transformers +- torch +- jsonformer +resources: + accelerator: A10G + memory: 25Gi + use_gpu: true +secrets: + hf_access_token: "ENTER HF ACCESS TOKEN HERE" +system_packages: [] \ No newline at end of file From ed92b800fe8fd8bf8c4451a28ece26f29f826e45 Mon Sep 17 00:00:00 2001 From: Kaushik Chatterjee Date: Sun, 26 May 2024 23:34:44 -0400 Subject: [PATCH 3/9] Created Truss for JSON Generation --- json-generation/model/__init__.py | 0 json-generation/model/model.py | 127 ++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 json-generation/model/__init__.py create mode 100644 json-generation/model/model.py diff --git a/json-generation/model/__init__.py b/json-generation/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/json-generation/model/model.py b/json-generation/model/model.py new file mode 100644 index 000000000..39f0c426b --- /dev/null +++ b/json-generation/model/model.py @@ -0,0 +1,127 @@ +import os +import time +from threading import Thread + +import torch +from transformers import GenerationConfig, TextIteratorStreamer, pipeline +import jsonformer + +class Model: + def __init__(self, schema, **kwargs): + self._repo_id = "NousResearch/Hermes-2-Pro-Mistral-7B" + self._hf_access_token = kwargs["secrets"]["hf_access_token"] + self._schema = schema + self._latency_metrics = dict() + self._model = None + self._jsonformer = jsonformer.Jsonformer(model=self._model, tokenizer=self._model.tokenizer, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")) + + def get_latency_metrics(self): + return self._latency_metrics + + def load(self): + self._model = pipeline( + "text-generation", + model=self._repo_id, + torch_dtype=torch.bfloat16, + device_map="auto", + token=self._hf_access_token, + ) + + def preprocess(self, request: dict): + generate_args = { + "max_new_tokens": 512, + "temperature": 1.0, + "top_p": 0.95, + "top_k": 50, + "repetition_penalty": 1.0, + "no_repeat_ngram_size": 0, + "use_cache": True, + "do_sample": True, + "eos_token_id": self._model.tokenizer.eos_token_id, + "pad_token_id": self._model.tokenizer.pad_token_id, + "return_full_text": False, + } + + request["generate_args"] = { + k: request.get(k, generate_args[k]) + for k in generate_args.keys() + } + + return request + + def stream(self, text_inputs: list, generation_args: dict): + streamer = TextIteratorStreamer(self._model.tokenizer) + generation_config = GenerationConfig(**generation_args) + generation_kwargs = { + "text_inputs": text_inputs, + "generation_config": generation_config, + "return_dict_in_generate": True, + "output_scores": True, + "max_new_tokens": generation_args["max_new_tokens"], + "streamer": streamer, + } + + with torch.no_grad(): + # Begin generation in a separate thread + thread = Thread(target=self._model, kwargs=generation_kwargs) + thread.start() + + # Yield generated text as it becomes available + def inner(): + for text in streamer: + yield text + thread.join() + + return inner() + + def predict(self, request: dict): + start_time = time.time() + prefill_start = time.time() + model_inputs = self._model.tokenizer.apply_chat_template(messages, ...) 
+ prefill_end = time.time() + prefill_time = prefill_end - prefill_start + + stream = request.pop("stream", False) + messages = request.pop("messages") + + # Create template for JSON generation + system_prompt = f"""<|im_start|>system +You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n\n{self._schema}\n<|im_end|>""" + + chat_template = system_prompt + "\n" + chat_template += "{% for message in messages %}" + chat_template += "<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n" + chat_template += "{% endfor %}" + chat_template += "{% if add_generation_prompt is not defined %}{% set add_generation_prompt = false %}{% endif %}" + chat_template += "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" + + model_inputs = self._model.tokenizer.apply_chat_template( + messages, chat_template=chat_template, tokenize=False, add_generation_prompt=True + ) + generation_args = request.pop("generate_args") + + generation_start = time.time() + + if stream: + return self.stream(model_inputs, generation_args) + + with torch.no_grad(): + results = self._jsonformer(text_inputs=model_inputs, **generation_args) + + first_token_time = time.time() - generation_start + total_tokens = len(results.split()) + total_time = time.time() - start_time + tpot = (total_time - first_token_time) / total_tokens if total_tokens > 0 else 0 + + self._latency_metrics = { + "prefill_time": prefill_time, + "time_to_first_token": first_token_time, + "time_per_output_token": tpot, + "total_generation_time": total_time, + } + + + if len(results) > 0: + return results[0].get("generated_text") + + raise Exception("No results returned from model") \ No newline at end of file From 45ab918d84d35e0b8556c9faae0e7f9e9d36d424 Mon Sep 17 00:00:00 2001 From: Kaushik Chatterjee Date: Sun, 26 May 2024 23:41:26 -0400 Subject: [PATCH 4/9] Updated README --- json-generation/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/json-generation/README.md b/json-generation/README.md index feb72db2d..c041e6fe4 100644 --- a/json-generation/README.md +++ b/json-generation/README.md @@ -4,4 +4,6 @@ Hermes 2 Pro is finetuned from Mistral's 7b-v0.1 model, incorporating a newly de In order to further mitigate the risk of hallucination, we use the open-source library Jsonformer (https://github.com/1rgs/jsonformer/?tab=readme-ov-file). Jsonformer is a wrapper around Hugging Face models that fill in the _fixed_ tokens during the generation process, delegating only the task of generating the content tokens to the language model. As a result, the generated JSON will always be syntatically correct (as there is no opportunity for hallucinations thanks to the separation of concerns) with a high overall efficiency as only the content tokens need to be generated, not an entire JSON string. By wrapping Hermes with Jsonformer, we hope to prevent any possibility of malformed or invalid JSON structure while increasing model performance and speed on content token generation. -The modifications I made to the Model class structure are the addition of a `schema` parameter, to allow the user to specify the desired JSON schema for generation, as well as adding a `latency_metrics` dictionary which records various metrics related to the latency of the model, namely prefill time, time to first token, time per output token, and total generation time. 
\ No newline at end of file
+The modifications I made to the Model class structure are the addition of a `schema` parameter, to allow the user to specify the desired JSON schema for generation, as well as adding a `latency_metrics` dictionary which records various metrics related to the latency of the model, namely prefill time, time to first token, time per output token, and total generation time.
+
+Although the model currently uses an LLM finetuned for the task of constrained decoding, due to wrapping the model in Jsonformer, it is possible to switch between various models for domain-specific tasks (e.g. a JSON of medical information). As such, it should be quite easy to generalize, with the default model selected to optimize performance across a broad set of domains.
\ No newline at end of file

From 2acda757fba047330739c656fb343686acfa0033 Mon Sep 17 00:00:00 2001
From: Kaushik Chatterjee
Date: Tue, 28 May 2024 00:08:57 -0400
Subject: [PATCH 5/9] Added sentencepiece to Requirements

---
 json-generation/config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/json-generation/config.yaml b/json-generation/config.yaml
index e81399416..30f5838d7 100644
--- a/json-generation/config.yaml
+++ b/json-generation/config.yaml
@@ -24,6 +24,7 @@ requirements:
 - accelerate
 - transformers
 - torch
+- sentencepiece
 - jsonformer
 resources:
   accelerator: A10G

From 60aeb793a83de38a1368ef2d5d8fb2cc4ea12e70 Mon Sep 17 00:00:00 2001
From: Kaushik Chatterjee
Date: Tue, 28 May 2024 00:09:21 -0400
Subject: [PATCH 6/9] Minor Refactoring

---
 json-generation/model/model.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/json-generation/model/model.py b/json-generation/model/model.py
index 39f0c426b..2fdbde7bb 100644
--- a/json-generation/model/model.py
+++ b/json-generation/model/model.py
@@ -7,13 +7,12 @@
 import jsonformer
 
 class Model:
-    def __init__(self, schema, **kwargs):
+    def __init__(self, **kwargs):
         self._repo_id = "NousResearch/Hermes-2-Pro-Mistral-7B"
         self._hf_access_token = kwargs["secrets"]["hf_access_token"]
-        self._schema = schema
         self._latency_metrics = dict()
         self._model = None
-        self._jsonformer = jsonformer.Jsonformer(model=self._model, tokenizer=self._model.tokenizer, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+        self._jsonformer = None
 
     def get_latency_metrics(self):
         return self._latency_metrics
@@ -27,6 +26,9 @@ def load(self):
             token=self._hf_access_token,
         )
 
+        self._jsonformer = jsonformer.Jsonformer(model=self._model, tokenizer=self._model.tokenizer, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+
     def preprocess(self, request: dict):
         generate_args = {
             "max_new_tokens": 512,
@@ -74,7 +76,7 @@ def inner():
 
         return inner()
 
-    def predict(self, request: dict):
+    def predict(self, schema: str, request: dict):
         start_time = time.time()
         prefill_start = time.time()
         model_inputs = self._model.tokenizer.apply_chat_template(messages, ...)
@@ -86,7 +88,7 @@ def predict(self, request: dict):
 
         # Create template for JSON generation
         system_prompt = f"""<|im_start|>system
-You are a helpful assistant that answers in JSON. Here's the json schema you must adhere to:\n\n{self._schema}\n<|im_end|>"""
+You are a helpful assistant that answers in JSON. 
Here's the json schema you must adhere to:\n\n{schema}\n<|im_end|>""" chat_template = system_prompt + "\n" chat_template += "{% for message in messages %}" From c687456daaeb8a6a20397e7c3e954bda77e3de83 Mon Sep 17 00:00:00 2001 From: Kaushik Chatterjee Date: Tue, 28 May 2024 19:26:54 -0400 Subject: [PATCH 7/9] Added Evaluation Results to README --- json-generation/README.md | 156 +++++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) diff --git a/json-generation/README.md b/json-generation/README.md index c041e6fe4..55170b4ad 100644 --- a/json-generation/README.md +++ b/json-generation/README.md @@ -6,4 +6,158 @@ In order to further mitigate the risk of hallucination, we use the open-source l The modifications I made to the Model class structure are the addition of a `schema` parameter, to allow the user to specify the desired JSON schema for generation, as well as adding a `latency_metrics` dictionary which records various metrics related to the latency of the model, namely prefill time, time to first token, time per output token, and total generation time. -Although the model curently uses an LLM finetuned for the task of constrained decoding, due to wrapping the model in Jsonformer, it is possible to switch between various models for domain-specifc tasks (e.g. a JSON of medical information). As such, it should be quite easy to generalize, with the default model selected to optimize performance across a broad set of domains. \ No newline at end of file +Although the model curently uses an LLM finetuned for the task of constrained decoding, due to wrapping the model in Jsonformer, it is possible to switch between various models for domain-specifc tasks (e.g. a JSON of medical information). As such, it should be quite easy to generalize, with the default model selected to optimize performance across a broad set of domains. + +A preliminary assessment of this model against the baseline model, Mistral-7B-v0.1, showed immensely promising results. Given the following schema, +```json +car = { + "type": "object", + "properties": { + "car": { + "type": "object", + "properties": { + "make": {"type": "string"}, + "model": {"type": "string"}, + "year": {"type": "number"}, + "colors": { + "type": "array", + "items": {"type": "string"} + }, + "features": { + "type": "object", + "properties": { + "audio": { + "type": "object", + "properties": { + "brand": {"type": "string"}, + "speakers": {"type": "number"}, + "hasBluetooth": {"type": "boolean"} + } + }, + "safety": { + "type": "object", + "properties": { + "airbags": {"type": "number"}, + "parkingSensors": {"type": "boolean"}, + "laneAssist": {"type": "boolean"} + } + }, + "performance": { + "type": "object", + "properties": { + "engine": {"type": "string"}, + "horsepower": {"type": "number"}, + "topSpeed": {"type": "number"} + } + } + } + } + } + }, + "owner": { + "type": "object", + "properties": { + "firstName": {"type": "string"}, + "lastName": {"type": "string"}, + "age": {"type": "number"}, + } + } + } +} +``` +the models were asked to generate an example car. The Hermes-2-Pro model + Jsonformer were able to successfully generate an example in __1min 4s ± 267 ms per loop__ (mean ± std. dev. 
of 7 runs, 1 loop each): +```json +{ + car: { + make: "Toyota", + model: "Corolla", + year: 2020.5, + colors: [ + "white", + "silver", + "gray", + "blue", + "black", + "red", + "green", + "yellow", + "orange", + "purple" + ], + features: { + audio: { + brand: "JBL", + speakers: 12.123, + hasBluetooth: True + }, + safety: { + airbags: 7.8989, + parkingSensors: True, + laneAssist: True + }, + performance: { + engine: "4-Cylinder Turbocharged E", + horsepower: 184.42, + topSpeed: 145.02 + } + } + }, + owner: { + firstName: "John", + lastName: "Doe", + age: 38.456 + } +} +``` + +Mistral, on the other hand, was unable to successfully generate an example (instead creating a false accident report) and took **3min 18s ± 75.3 ms per loop** (mean ± std. dev. of 7 runs, 1 loop each): + +``` +Car Accident Report + +Date: [Insert Date] +Time: [Insert Time] +Location: [Insert Address] + +Driver 1: +Name: [Insert Name] +Age: [Insert Age] +Gender: [Insert Gender] +Address: [Insert Address] +Phone: [Insert Phone Number] + +Driver 2: +Name: [Insert Name] +Age: [Insert Age] +Gender: [Insert Gender] +Address: [Insert Address] +Phone: [Insert Phone Number] + +Vehicle 1: +Make: [Insert Make] +Model: [Insert Model] +Year: [Insert Year] +Color: [Insert Color] +License Plate Number: [Insert License Plate Number] + +Vehicle 2: +Make: [Insert Make] +Model: [Insert Model] +Year: [Insert Year] +Color: [Insert Color] +License Plate Number: [Insert License Plate Number] + +Accident Summary: + +On [Insert Date] at [Insert Time], a car accident occurred at [Insert Address]. The accident involved two vehicles, a [Insert Make] [Insert Model] [Insert Year] [Insert Color] with license plate number [Insert License Plate Number], driven by [Insert Name], and a [Insert Make] [Insert Model] [Insert Year] [Insert Color] with license plate number [Insert License Plate Number], driven by [Insert Name]. + +The accident occurred when Driver 1, who was traveling northbound on [Insert Road], failed to yield the right of way to Driver 2, who was traveling eastbound on [Insert Road]. The two vehicles collided at the intersection of [Insert Road] and [Insert Road], causing damage to both vehicles. + +There were no injuries reported as a result of the accident. + +Witnesses to the accident include [Insert Witness 1 Name], [Insert Witness 2 Name], and [Insert Witness 3 Name]. + +The investigation into the accident is ongoing. +``` + +This model is both more accurate as well as efficient when compared to its base, as a result both of the fine-tuning, allowing the model to more effectively handle and understand JSON, as well as the constrained decoding methodology of Jsonformer which allowed for a separation of concerns between schema and output. 
\ No newline at end of file From 59513ea733baadcad1fed69431db6b329ffccf4a Mon Sep 17 00:00:00 2001 From: Kaushik Chatterjee Date: Wed, 29 May 2024 09:58:36 -0400 Subject: [PATCH 8/9] Added Protobuf to Requirements --- json-generation/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/json-generation/config.yaml b/json-generation/config.yaml index 30f5838d7..000011e5f 100644 --- a/json-generation/config.yaml +++ b/json-generation/config.yaml @@ -25,6 +25,7 @@ requirements: - transformers - torch - sentencepiece +- protobuf - jsonformer resources: accelerator: A10G From 6553aad7e8d5b57d3a166427d26d9c7dcc8cdbb4 Mon Sep 17 00:00:00 2001 From: Kaushik Chatterjee Date: Wed, 29 May 2024 10:01:01 -0400 Subject: [PATCH 9/9] Fixed JSONformer Constructor Call --- json-generation/model/model.py | 38 ++++++++-------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/json-generation/model/model.py b/json-generation/model/model.py index 2fdbde7bb..36e7af317 100644 --- a/json-generation/model/model.py +++ b/json-generation/model/model.py @@ -4,15 +4,14 @@ import torch from transformers import GenerationConfig, TextIteratorStreamer, pipeline -import jsonformer +from jsonformer.format import highlight_values +from jsonformer.main import Jsonformer class Model: def __init__(self, **kwargs): self._repo_id = "NousResearch/Hermes-2-Pro-Mistral-7B" self._hf_access_token = kwargs["secrets"]["hf_access_token"] self._latency_metrics = dict() - self._model = None - self._jsonformer = None def get_latency_metrics(self): return self._latency_metrics @@ -26,8 +25,6 @@ def load(self): token=self._hf_access_token, ) - self._jsonformer = jsonformer.Jsonformer(model=self._model, tokenizer=self._model.tokenizer, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")) - def preprocess(self, request: dict): generate_args = { @@ -76,7 +73,7 @@ def inner(): return inner() - def predict(self, schema: str, request: dict): + def predict(self, schema: str, request: dict, prompt:str="Generate an example for the provided schema"): start_time = time.time() prefill_start = time.time() model_inputs = self._model.tokenizer.apply_chat_template(messages, ...) @@ -86,32 +83,15 @@ def predict(self, schema: str, request: dict): stream = request.pop("stream", False) messages = request.pop("messages") - # Create template for JSON generation - system_prompt = f"""<|im_start|>system -You are a helpful assistant that answers in JSON. 
Here's the json schema you must adhere to:\n\n{schema}\n<|im_end|>"""
-
-        chat_template = system_prompt + "\n"
-        chat_template += "{% for message in messages %}"
-        chat_template += "<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n"
-        chat_template += "{% endfor %}"
-        chat_template += "{% if add_generation_prompt is not defined %}{% set add_generation_prompt = false %}{% endif %}"
-        chat_template += "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
-
-        model_inputs = self._model.tokenizer.apply_chat_template(
-            messages, chat_template=chat_template, tokenize=False, add_generation_prompt=True
-        )
         generation_args = request.pop("generate_args")
+        jsonformer = Jsonformer(model=self._model.model, tokenizer=self._model.tokenizer, json_schema=schema, prompt=prompt)  # use the imported Jsonformer class with the pipeline's underlying model and tokenizer
 
         generation_start = time.time()
-
-        if stream:
-            return self.stream(model_inputs, generation_args)
-
         with torch.no_grad():
-            results = self._jsonformer(text_inputs=model_inputs, **generation_args)
-
+            output = jsonformer()
+
         first_token_time = time.time() - generation_start
-        total_tokens = len(results.split())
+        total_tokens = len(str(output).split())  # Jsonformer returns a dict, so count tokens on its string form
         total_time = time.time() - start_time
         tpot = (total_time - first_token_time) / total_tokens if total_tokens > 0 else 0
 
         self._latency_metrics = {
             "prefill_time": prefill_time,
             "time_to_first_token": first_token_time,
             "time_per_output_token": tpot,
             "total_generation_time": total_time,
         }
 
 
-        if len(results) > 0:
-            return results[0].get("generated_text")
+        if len(output) > 0:
+            return output
 
         raise Exception("No results returned from model")
\ No newline at end of file
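
For reference, the core pattern these patches build on — wrapping the Hermes 2 Pro model with Jsonformer so that only content tokens are generated by the LLM — can be sketched standalone. The snippet below is illustrative rather than part of the Truss: the schema and prompt are placeholders, and it assumes the `jsonformer`, `transformers`, and `torch` packages listed in `config.yaml`.

```python
# Minimal sketch of the Jsonformer + Hermes 2 Pro combination described in the README.
# The schema and prompt are placeholders; adjust them for your own use case.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from jsonformer import Jsonformer

repo_id = "NousResearch/Hermes-2-Pro-Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Jsonformer emits the fixed JSON structure itself and only asks the model
# for the content values, so the result always conforms to the schema.
schema = {
    "type": "object",
    "properties": {
        "make": {"type": "string"},
        "model": {"type": "string"},
        "year": {"type": "number"},
    },
}
generator = Jsonformer(model, tokenizer, schema, "Generate an example car.")
print(generator())  # e.g. {"make": "Toyota", "model": "Corolla", "year": 2020}
```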