Skip to content

Commit e52c773

Browse files
authored
Merge pull request #136 from georgian-io/jsonl-support
`jsonl` format support
2 parents ce09375 + c5d35d0 commit e52c773

File tree

2 files changed

+17
-2
lines changed

2 files changed

+17
-2
lines changed

llmtune/data/ingestor.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,12 +8,14 @@
88
def get_ingestor(data_type: str):
    """Return the ingestor class that handles ``data_type``.

    Args:
        data_type: Name of the dataset source format. Compared with ``==``,
            so matching is exact and case-sensitive. Supported values are
            ``"json"``, ``"jsonl"``, ``"csv"`` and ``"huggingface"``.

    Returns:
        The matching ingestor class itself (not an instance); the caller is
        responsible for constructing it.

    Raises:
        ValueError: If ``data_type`` is not one of the supported values.
    """
    # Every branch returns, so plain guard clauses replace the elif chain.
    if data_type == "json":
        return JsonIngestor
    if data_type == "jsonl":
        return JsonlIngestor
    if data_type == "csv":
        return CsvIngestor
    if data_type == "huggingface":
        return HuggingfaceIngestor
    raise ValueError(f"'type' must be one of 'json', 'jsonl', 'csv', or 'huggingface', you have {data_type}")
1719

1820

1921
class Ingestor(ABC):
@@ -35,6 +37,19 @@ def to_dataset(self) -> Dataset:
3537
return Dataset.from_generator(self._json_generator)
3638

3739

40+
class JsonlIngestor(Ingestor):
    """Ingestor that builds a ``Dataset`` from a JSON Lines (``.jsonl``) file."""

    def __init__(self, path: str):
        """Store the file location; the file is not opened until iteration.

        Args:
            path: Path to the JSON Lines file.
        """
        self.path = path

    def _jsonl_generator(self):
        """Yield the JSON values found in the file, one at a time.

        The file is opened in binary mode and handed to ``ijson``, which
        parses it incrementally. The file stays open only while the
        generator is being consumed.
        """
        with open(self.path, "rb") as f:
            # NOTE(review): per the ijson docs, the empty prefix selects
            # top-level values and multiple_values=True permits more than one
            # top-level value in the stream -- confirm against the pinned
            # ijson version.
            yield from ijson.items(f, "", multiple_values=True)

    def to_dataset(self) -> Dataset:
        """Build a ``Dataset`` by passing the record generator to ``Dataset.from_generator``."""
        return Dataset.from_generator(self._jsonl_generator)
51+
52+
3853
class CsvIngestor(Ingestor):
3954
def __init__(self, path: str):
4055
self.path = path

llmtune/pydantic_models/config_model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ class QaConfig(BaseModel):
1313

1414

1515
class DataConfig(BaseModel):
16-
file_type: Literal["json", "csv", "huggingface"] = Field(None, description="File type")
16+
file_type: Literal["json", "jsonl", "csv", "huggingface"] = Field(None, description="File type")
1717
path: Union[FilePath, HfModelPath] = Field(None, description="Path to the file or HuggingFace model")
1818
prompt: str = Field(None, description="Prompt for the model. Use {} brackets for column name")
1919
prompt_stub: str = Field(

0 commit comments

Comments
 (0)