Skip to content

Commit 4246b45

Browse files
authored
feat: add ChatMLConvertor for chatml-jsonl format and enhance Executor
1 parent c37b7a2 commit 4246b45

File tree

3 files changed

+40
-1
lines changed

3 files changed

+40
-1
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
__pycache__/
2+
*.egg-info/

dingo/data/converter/base.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,43 @@ def find_levels_image(cls, data: json, levels: str) -> List:
4545
res = reduce(lambda x, y: x[y], levels.split('.'), data)
4646
return res if isinstance(res, List) else [res]
4747

48+
@BaseConverter.register("chatml-jsonl")
class ChatMLConvertor(BaseConverter):
    """
    ddm chatml file converter.

    Each record is a JSON object (or a JSON string that decodes to one)
    holding an ``_id`` and a ``dialogs`` list whose items carry ``role`` and
    ``content`` keys. The final dialog turn supplies the content; every
    earlier turn, plus the role of the final turn, is folded into the prompt.
    """

    def __init__(self):
        super().__init__()

    @classmethod
    def convertor(cls, input_args: InputArgs) -> Callable:
        """Build the per-record conversion function.

        Args:
            input_args: Run configuration. Not read by this converter.

        Returns:
            A callable that maps one raw record (``str`` or ``dict``) to a
            ``MetaData`` instance.
        """
        def _convert(raw: Union[str, Dict]):
            """Convert one chatml record into ``MetaData``.

            Raises:
                KeyError: If ``dialogs``, ``_id``, or a turn's ``role`` /
                    ``content`` key is missing.
            """
            record = json.loads(raw) if isinstance(raw, str) else raw

            dialogs: list = record["dialogs"]
            prompt_parts = []
            content = ""

            # Every turn but the last is context: the role, then its text,
            # each followed by a blank line. The role is emitted bare (no
            # trailing colon), matching the text produced before.
            for turn in dialogs[:-1]:
                prompt_parts.append(f"{turn['role']}\n\n")
                prompt_parts.append(f"{turn['content']}\n\n")

            # The guard only needs to protect dialogs[-1] from an empty list.
            # Testing for "more than one" turn would also drop single-turn
            # records and leave both prompt and content empty.
            if dialogs:
                prompt_parts.append(dialogs[-1]["role"])
                content += dialogs[-1]["content"]

            return MetaData(**{
                'data_id': record["_id"],
                'prompt': "".join(prompt_parts),
                'content': content,
                'raw_data': record
            })

        return _convert
84+
4885

4986
@BaseConverter.register('json')
5087
class JsonConverter(BaseConverter):

dingo/exec/local.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def evaluate(self):
9494
group (Any): _description_
9595
group_type (str): _description_
9696
"""
97-
with concurrent.futures.ThreadPoolExecutor(max_workers=self.input_args.max_workers) as executor:
97+
with concurrent.futures.ProcessPoolExecutor(max_workers=self.input_args.max_workers) as executor:
9898
data_iter = self.load_data()
9999
data_iter = itertools.islice(data_iter, self.input_args.start_index, None)
100100

0 commit comments

Comments
 (0)