|
7 | 7 | from dify_plugin.interfaces.agent import AgentScratchpadUnit |
8 | 8 |
|
9 | 9 | PREFIX_DELIMITERS = frozenset({"\n", " ", ""}) |
| 10 | +# Tags injected by Gemini when include_thoughts=True; stripped so ReAct sees only Thought:/Action:/FinalAnswer: |
| 11 | +THINK_START = "<think>" |
| 12 | +THINK_END = "</think>" |
10 | 13 |
|
11 | 14 |
|
12 | 15 | class ReactState(Enum): |
@@ -115,11 +118,63 @@ def step(self, delta: str) -> tuple[bool, ReactChunk | None, bool, bool]: |
115 | 118 | answer_matcher = PrefixMatcher(ReactState.ANSWER) |
116 | 119 | thought_matcher = PrefixMatcher(ReactState.THINKING) |
117 | 120 |
|
| 121 | + _in_think = False |
| 122 | + _think_buf = "" |
| 123 | + _think_depth = 0 |
118 | 124 | for response in llm_response: |
119 | 125 | if response.delta.usage: |
120 | 126 | usage_dict["usage"] = response.delta.usage |
121 | | - response_content = response.delta.message.content |
122 | | - if not isinstance(response_content, str): |
| 127 | + raw = response.delta.message.content |
| 128 | + if isinstance(raw, str): |
| 129 | + response_content = raw |
| 130 | + elif isinstance(raw, list): |
| 131 | + # Plugins (e.g. Gemini) send content as list; some items may be non-text (e.g. image) |
| 132 | + parts = [ |
| 133 | + s |
| 134 | + for c in raw |
| 135 | + if isinstance(s := (getattr(c, "data", None) or getattr(c, "text", None)), str) |
| 136 | + ] |
| 137 | + response_content = "".join(parts) |
| 138 | + else: |
| 139 | + continue |
| 140 | + if not response_content: |
| 141 | + continue |
| 142 | + # When include_thoughts=True, Gemini injects <think>...</think>; strip it so the ReAct parser |
| 143 | + # only sees Thought:/Action:/FinalAnswer:. Thought text and a split </think> are buffered across |
| 144 | + # chunks; a <think> tag split across two chunks is NOT detected. Nesting uses a depth counter. |
| 145 | + if THINK_START in response_content or THINK_END in response_content or _in_think: |
| 146 | + buf = _think_buf + response_content |
| 147 | + _think_buf = "" |
| 148 | + out = [] |
| 149 | + i = 0 |
| 150 | + while i < len(buf): |
| 151 | + if _in_think: |
| 152 | + end_j = buf.find(THINK_END, i) |
| 153 | + start_j = buf.find(THINK_START, i) |
| 154 | + if end_j == -1 and start_j == -1: |
| 155 | + _think_buf = buf[i:] |
| 156 | + break |
| 157 | + if start_j != -1 and (end_j == -1 or start_j < end_j): |
| 158 | + _think_depth += 1 |
| 159 | + i = start_j + len(THINK_START) |
| 160 | + else: |
| 161 | + j = end_j |
| 162 | + _think_depth -= 1 |
| 163 | + if _think_depth <= 0: |
| 164 | + _in_think = False |
| 165 | + _think_depth = 0 |
| 166 | + i = j + len(THINK_END) |
| 167 | + else: |
| 168 | + j = buf.find(THINK_START, i) |
| 169 | + if j == -1: |
| 170 | + out.append(buf[i:]) |
| 171 | + break |
| 172 | + out.append(buf[i:j]) |
| 173 | + _in_think = True |
| 174 | + _think_depth = 1 |
| 175 | + i = j + len(THINK_START) |
| 176 | + response_content = "".join(out) |
| 177 | + if not response_content: |
123 | 178 | continue |
124 | 179 |
|
125 | 180 | # stream |
|
0 commit comments