chore: fix OpenAI GPT template

thewebscraping · thewebscraping · commit 8a64896eed4d · 2024-12-29T22:45:38.000+07:00
diff --git a/gemma_template/__version__.py b/gemma_template/__version__.py
@@ -3,5 +3,5 @@
 __url__ = "https://github.com/thewebscraping/gemma-template"
 __author__ = "Tu Pham"
 __author_email__ = "thetwofarm@gmail.com"
-__version__ = "0.1.2"
+__version__ = "0.1.3"
 __license__ = "Apache-2.0"
diff --git a/gemma_template/models.py b/gemma_template/models.py
@@ -2,6 +2,7 @@
 
 import asyncio
 import json
+from math import ceil
 from pathlib import Path
 from string import punctuation
 from typing import (Callable, ClassVar, Literal, Optional, Sequence, Union,
@@ -124,8 +125,6 @@ class StructureField(BaseTemplate):
         "description": [
             "Description",
             "Introduction",
-            "Summary",
-            "Intro",
             "Meta Description",
         ],
         "document": ["Article", "Edit Article"],
@@ -401,6 +400,7 @@ def load_dataset(
         min_chars_length: int = 2,
         max_chars_length: int = 0,
         max_concurrency: int = 4,
+        is_remove_data: bool = True,
         is_close_async_loop: bool = True,
         **kwargs,
     ) -> Union[Dataset, DatasetDict]:
@@ -436,6 +436,8 @@ def load_dataset(
                 Maximum character of a word, used to create unigrams, bigrams and trigrams. Default is 0.
             max_concurrency (int):
                 Maximum number of concurrent threads for processing data. Default is 4.
+            is_remove_data (bool):
+                True will remove the original data from the dataset, otherwise it will keep the field as `data` in the dataset.
             is_close_async_loop (bool):
                 By default it will close the asyncio event loop every time I finish processing the dataset data.
                 Although it has handled the `RuntimeError` exception. However, you should set it to False if running on Kaggle Notebooks and Colab.
@@ -470,14 +472,15 @@ def load_dataset(
             ```
         """  # noqa: E501
 
-        async def create_task(config, hidden_count: int = 0):
+        async def create_task(config, max_hidden_count: int = 0, hidden_count: int = 0):
             async with semaphore:
                 config.update(kwargs)
                 config.update(
                     dict(
                         min_chars_length=min_chars_length,
                         max_chars_length=max_chars_length,
                         excluded_fields=excluded_fields,
+                        is_remove_data=is_remove_data,
                     )
                 )
                 if max_hidden_ratio > 0 and hidden_count < max_hidden_count:
@@ -518,9 +521,10 @@ async def create_task(config, hidden_count: int = 0):
                 hidden_count += 1
 
         async def run_task(ds):
+            max_hidden_count = ceil(len(ds) * max_hidden_ratio)
             await asyncio.wait(
                 [
-                    loop.create_task(create_task(config, idx))
+                    loop.create_task(create_task(config, max_hidden_count, idx))
                     for idx, config in enumerate(ds)
                 ]
             )
@@ -558,8 +562,6 @@ def _close():
                     )
                 )
 
-        items = []
-        max_hidden_count = int(len(dataset) * max_hidden_ratio)
         try:
             loop = asyncio.get_running_loop()
         except RuntimeError:
@@ -580,7 +582,6 @@ def _close():
             with tqdm(total=len(dataset)) as pbar:
                 for field in dataset.column_names:
                     items = []
-                    max_hidden_count = int(len(dataset[field]) * max_hidden_ratio)
                     _ = loop.run_until_complete(run_task(dataset[field]))
                     mapping[field] = Dataset.from_list(items)
 
@@ -836,8 +837,7 @@ def generate_user_prompt(
 
     def generate_model_prompt(
         self,
-        structure_template: Optional[TemplateTypes] = None,
-        excluded_fields: Optional[Sequence[str]] = (),
+        structure_template: Optional[TemplateTypes] = "",
         bullet_style: Optional[Union[str, Literal["dash", "number"]]] = "dash",
         **kwargs,
     ) -> str:
@@ -849,7 +849,6 @@ def generate_model_prompt(
 
         Args:
             structure_template (Optional[Union[str, Callable]]): A structure template defining the generating structure prompt.
-            excluded_fields (Sequence[str]): Fields excluded to response. Default is empty sequence.
             bullet_style (Optional[str]): Bullet list style start dash or number. Default is dash.
             **kwargs: See also `Template.template`.
 
@@ -866,11 +865,6 @@ def generate_model_prompt(
         """  # noqa: E501
 
         output_document = kwargs.get("output", "")
-        if excluded_fields:
-            for excluded_field in excluded_fields:
-                if excluded_field in kwargs:
-                    kwargs.pop(excluded_field)
-
         if isinstance(structure_template, (str, Callable)):
             kwargs["document"] = output_document
             if isinstance(structure_template, Callable):
@@ -916,6 +910,7 @@ def to_text(
             language_code=user_kwargs.get("language_code", "auto"),
             language=user_kwargs.get("language"),
             is_masked=bool(user_kwargs.get("is_masked")),
+            data=self._get_origin_data(**kwargs),
         )
 
     def to_alpaca(
@@ -941,6 +936,7 @@ def to_alpaca(
             language_code=user_kwargs.get("language_code", "auto"),
             language=user_kwargs.get("language"),
             is_masked=bool(user_kwargs.get("is_masked")),
+            data=self._get_origin_data(**kwargs),
         )
 
     def to_openai(
@@ -955,8 +951,16 @@ def to_openai(
             user_template, instruction_template, structure_template, **kwargs
         )
         return dict(
-            human=user_template,
-            gpt=model_template,
+            conversations=[
+                {
+                    "from": "human",
+                    "value": user_template,
+                },
+                {
+                    "from": "gpt",
+                    "value": model_template,
+                },
+            ],
             is_instructed=bool(instruction_template is not None),
             is_structured=bool(structure_template is not None),
             unigrams=user_kwargs.get("unigrams", []) or [],
@@ -965,6 +969,7 @@ def to_openai(
             language_code=user_kwargs.get("language_code", "auto"),
             language=user_kwargs.get("language"),
             is_masked=bool(user_kwargs.get("is_masked")),
+            data=self._get_origin_data(**kwargs),
         )
 
     def _get_template(
@@ -1069,10 +1074,14 @@ def _ftm_template(word):
     def _formatting_structure_user_fn(
         self,
         structure_template: str = STRUCTURE_TEMPLATE,
+        excluded_fields: Sequence[str] = (),
         **kwargs,
     ) -> str:
         prompts = []
-        for _, data in self._get_structure_attrs(**kwargs).items():
+        for field, data in self._get_structure_attrs(**kwargs).items():
+            if excluded_fields and field in excluded_fields:
+                continue
+
             prompts.append(
                 "{field} {prompt}".format(
                     field=data["bold_value"], prompt=data["prompt"]
@@ -1085,6 +1094,7 @@ def _formatting_structure_model_fn(
         self,
         structure_data: dict,
         bullet_style: str = None,
+        excluded_fields: Sequence[str] = (),
         *args,
         **kwargs,
     ) -> str:
@@ -1097,6 +1107,9 @@ def _formatting_structure_model_fn(
             if field not in kwargs:
                 continue
 
+            if excluded_fields and field in excluded_fields:
+                continue
+
             value = kwargs[field]
             if not value:
                 continue
@@ -1129,6 +1142,11 @@ def _get_structure_attrs(self, **kwargs):
                 }
         return mapping
 
+    def _get_origin_data(self, **kwargs) -> dict:
+        if not kwargs.get("is_remove_data", True):
+            return {k: v for k, v in kwargs.items() if hasattr(self, k)}
+        return {}
+
 
 gemma_template = Template()
 vietnamese_gemma_template = Template(