daVinci-Dev/env_traj_utils/convert_trajectories.py at main · samebae-cohere/daVinci-Dev · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#!/usr/bin/env python3
"""
Convert GLM-4.6 SWE-agent trajectories to XML function calling format for Qwen SFT.

Source format (GLM-4.6):
- assistant messages have: content, reasoning_content, tool_calls (JSON format)
- tool messages have: content (tool response), tool_call_id

Target format (XML function calling):
- assistant messages: reasoning + content + XML tool call
- tool responses become user messages; their content is passed through unchanged (the "OBSERVATION:" prefix is currently disabled)
"""

import json
import argparse
from pathlib import Path
from typing import Any

# System prompt in the XML function-calling format (from agent_system_prompt.md).
# convert_message() substitutes this text for the first system message of a
# trajectory. It declares three functions (bash, submit, str_replace_editor)
# and the <function=...>/<parameter=...> reply syntax. The text is runtime
# data: any edit to it changes the content of every converted sample.
SYSTEM_PROMPT = """You are a helpful assistant that can interact with a computer to solve tasks.
<IMPORTANT>
* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
</IMPORTANT>

You have access to the following functions:

---- BEGIN FUNCTION #1: bash ----
Description: Execute a bash command in the terminal.

Parameters:
(1) command (string, required): The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.
---- END FUNCTION #1 ----

---- BEGIN FUNCTION #2: submit ----
Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.
No parameters are required for this function.
---- END FUNCTION #2 ----

---- BEGIN FUNCTION #3: str_replace_editor ----
Description: Custom editing tool for viewing, creating and editing files
* State is persistent across command calls and discussions with the user
* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
* The `create` command cannot be used if the specified `path` already exists as a file
* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
* The `undo_edit` command will revert the last edit made to the file at `path`

Notes for using the `str_replace` command:
* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
* The `new_str` parameter should contain the edited lines that should replace the `old_str`

Parameters:
(1) command (string, required): The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.
Allowed values: [`view`, `create`, `str_replace`, `insert`, `undo_edit`]
(2) path (string, required): Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.
(3) file_text (string, optional): Required parameter of `create` command, with the content of the file to be created.
(4) old_str (string, optional): Required parameter of `str_replace` command containing the string in `path` to replace.
(5) new_str (string, optional): Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.
(6) insert_line (integer, optional): Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.
(7) view_range (array, optional): Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.
---- END FUNCTION #3 ----


If you choose to call a function ONLY reply in the following format with NO suffix:

Provide any reasoning for the function call here.
<function=example_function_name>
<parameter=example_parameter_1>value_1</parameter>
<parameter=example_parameter_2>
This is the value for the second parameter
that can span
multiple lines
</parameter>
</function>

<IMPORTANT>
Reminder:
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Always provide reasoning for your function call in natural language BEFORE the function call (not after)
</IMPORTANT>"""


def convert_tool_call_to_xml(tool_call: dict) -> str:
    """Convert a JSON tool call to XML format.

    Input format:
    {
        "function": {
            "name": "str_replace_editor",
            "arguments": '{"command": "view", "path": "/testbed"}'
        },
        "id": "call_xxx",
        "type": "function"
    }

    Output format:
    <function=str_replace_editor>
    <parameter=command>view</parameter>
    <parameter=path>/testbed</parameter>
    </function>

    Args:
        tool_call: Tool call dict with a ``function`` entry holding ``name``
            and ``arguments``. ``arguments`` is normally a JSON-encoded
            object string, but an already-decoded dict and ``None`` (or a
            missing key) are accepted as well.

    Returns:
        The XML function call, one line per element, joined with newlines.
        List and dict parameter values are rendered as JSON; every other
        value is rendered with ``str()``.

    Raises:
        KeyError: If ``function`` or its ``name`` is missing.
    """
    func_name = tool_call["function"]["name"]
    raw_args = tool_call["function"].get("arguments")

    if raw_args is None:
        # No arguments at all (e.g. a parameterless call): emit no parameters.
        args = {}
    elif isinstance(raw_args, dict):
        # Some exports store the arguments already decoded.
        args = raw_args
    else:
        try:
            args = json.loads(raw_args)
        except (json.JSONDecodeError, TypeError):
            args = None
        # Invalid JSON, or valid JSON that is not an object (e.g. "123"),
        # cannot be mapped to named parameters: treat the raw value as a
        # single command parameter.
        if not isinstance(args, dict):
            args = {"command": raw_args}

    # Build XML
    xml_parts = [f"<function={func_name}>"]
    for param_name, param_value in args.items():
        # Structured values keep their JSON form; scalars are stringified.
        if isinstance(param_value, (list, dict)):
            param_str = json.dumps(param_value)
        else:
            param_str = str(param_value)
        xml_parts.append(f"<parameter={param_name}>{param_str}</parameter>")
    xml_parts.append("</function>")

    return "\n".join(xml_parts)


def convert_message(msg: dict, is_first_system: bool = False) -> dict | None:
    """Convert a single message to the target format.

    Returns None if the message should be skipped.
    """
    role = msg.get("role")

    if role == "system":
        if is_first_system:
            # Replace system prompt with XML version
            return {"role": "system", "content": SYSTEM_PROMPT}
        else:
            # Skip additional system messages
            return None

    elif role == "user":
        # Keep user messages as is (just content)
        return {"role": "user", "content": msg.get("content", "")}

    elif role == "assistant":
        # Combine reasoning_content + content + XML tool call
        parts = []

        # Add reasoning content if present
        reasoning = msg.get("reasoning_content")
        if reasoning:
            parts.append("<think>" + reasoning + "</think>")

        # Add content if present
        content = msg.get("content")
        if content:
            parts.append(content)

        # Convert tool calls to XML
        tool_calls = msg.get("tool_calls", [])
        assert len(tool_calls) <= 1, "Expected at most one tool call per assistant message"
        for tc in tool_calls:
            xml_call = convert_tool_call_to_xml(tc)
            parts.append(xml_call)

        combined_content = "\n\n".join(parts) if parts else ""
        return {"role": "assistant", "content": combined_content}

    elif role == "tool":
        # Convert tool response to user message with OBSERVATION prefix
        content = msg.get("content", "")
        # # Check if content already starts with OBSERVATION
        # if not content.startswith("OBSERVATION:"):
        #     content = f"OBSERVATION:\n{content}"
        return {"role": "user", "content": content}

    else:
        # Unknown role, skip
        print(f"Warning: Unknown role '{role}', skipping message")
        return None


def convert_trajectory(data: dict) -> dict:
    """Translate one GLM-format trajectory into the XML function-calling format.

    Each entry of ``data["messages"]`` is run through ``convert_message``;
    entries for which it returns None are dropped. Only the first system
    message is flagged as such. ``sample_name`` is carried over when the
    input has it; no other top-level key is copied.
    """
    system_already_seen = False
    kept = []

    for entry in data.get("messages", []):
        first_system = False
        if entry.get("role") == "system":
            # Only the very first system message gets the flag.
            first_system = not system_already_seen
            system_already_seen = True

        new_message = convert_message(entry, first_system)
        if new_message is None:
            continue
        kept.append(new_message)

    output = {"messages": kept}
    if "sample_name" in data:
        # Keep the sample identifier alongside the converted messages.
        output["sample_name"] = data["sample_name"]
    return output


def process_file(input_path: str, output_path: str):
    """Process a JSONL file and convert all trajectories.

    Each non-blank input line is parsed as JSON, converted with
    ``convert_trajectory`` and written as one JSON line to the output.
    Lines that fail to parse or convert are reported and skipped.

    Args:
        input_path: Path of the JSONL file to read.
        output_path: Path of the JSONL file to write (overwritten if present).

    Raises:
        ValueError: If ``input_path`` and ``output_path`` resolve to the same
            file.
    """
    input_file = Path(input_path)
    output_file = Path(output_path)

    # Opening the output in 'w' mode truncates it immediately; if it is the
    # input file, every trajectory would be destroyed before being read.
    if input_file.resolve() == output_file.resolve():
        raise ValueError(
            f"Input and output are the same file: {input_path}"
        )

    converted_count = 0
    failed_count = 0

    with open(input_file, 'r', encoding='utf-8') as fin, \
         open(output_file, 'w', encoding='utf-8') as fout:

        for line_num, line in enumerate(fin, 1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
                converted = convert_trajectory(data)
                fout.write(json.dumps(converted, ensure_ascii=False) + '\n')
            except json.JSONDecodeError as e:
                print(f"Error parsing line {line_num}: {e}")
                failed_count += 1
                continue
            except Exception as e:
                # Best effort: one bad trajectory must not abort the run.
                print(f"Error processing line {line_num}: {e}")
                failed_count += 1
                continue

            # Progress is based on trajectories actually written, not on the
            # raw line number (which also counts blank and failed lines).
            converted_count += 1
            if converted_count % 10 == 0:
                print(f"Processed {converted_count} trajectories...")

    print(f"Converted {converted_count} trajectories ({failed_count} failed)")
    print(f"Conversion complete. Output written to {output_path}")


def main():
    """Command-line entry point: parse arguments and run the conversion.

    Takes the input JSONL path as a positional argument and an optional
    ``-o/--output`` path. When no output is given, the result is written next
    to the input as ``<stem>_converted<suffix>``.
    """
    cli = argparse.ArgumentParser(
        description="Convert GLM-4.6 SWE-agent trajectories to XML function calling format"
    )
    cli.add_argument("input", help="Input JSONL file with GLM trajectories")
    cli.add_argument(
        "-o",
        "--output",
        help="Output JSONL file (default: input_converted.jsonl)",
    )
    options = cli.parse_args()

    destination = options.output
    if not destination:
        # Derive the default output name from the input file's own name.
        source = Path(options.input)
        destination = str(source.parent / f"{source.stem}_converted{source.suffix}")

    process_file(options.input, destination)


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()