-
Notifications
You must be signed in to change notification settings - Fork 2.6k
fix: add Unicode support for file path mentions in slash commands #7241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| import type { ModelInfo } from "../model.js" | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This entire file is unrelated to fixing Unicode support for file mentions. Could we separate the Featherless provider addition into a different PR to keep changes focused? |
||
|
|
||
| // Featherless AI models - https://api.featherless.ai/v1/models | ||
| export type FeatherlessModelId = keyof typeof featherlessModels | ||
|
|
||
| export const featherlessDefaultModelId: FeatherlessModelId = "meta-llama/Meta-Llama-3.1-8B-Instruct" | ||
|
|
||
| export const featherlessModels = { | ||
| "meta-llama/Meta-Llama-3.1-8B-Instruct": { | ||
| maxTokens: 8192, | ||
| contextWindow: 131072, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.1, | ||
| outputPrice: 0.1, | ||
| description: "Meta's Llama 3.1 8B Instruct model with 128K context window", | ||
| }, | ||
| "meta-llama/Meta-Llama-3.1-70B-Instruct": { | ||
| maxTokens: 8192, | ||
| contextWindow: 131072, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.4, | ||
| outputPrice: 0.4, | ||
| description: "Meta's Llama 3.1 70B Instruct model with 128K context window", | ||
| }, | ||
| "meta-llama/Meta-Llama-3.1-405B-Instruct": { | ||
| maxTokens: 8192, | ||
| contextWindow: 131072, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 2.0, | ||
| outputPrice: 2.0, | ||
| description: "Meta's largest Llama 3.1 405B Instruct model with 128K context window", | ||
| }, | ||
| "Qwen/Qwen2.5-72B-Instruct": { | ||
| maxTokens: 8192, | ||
| contextWindow: 131072, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.4, | ||
| outputPrice: 0.4, | ||
| description: "Alibaba's Qwen 2.5 72B Instruct model with 128K context window", | ||
| }, | ||
| "mistralai/Mistral-7B-Instruct-v0.3": { | ||
| maxTokens: 8192, | ||
| contextWindow: 32768, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.1, | ||
| outputPrice: 0.1, | ||
| description: "Mistral's 7B Instruct v0.3 model with 32K context window", | ||
| }, | ||
| "mistralai/Mixtral-8x7B-Instruct-v0.1": { | ||
| maxTokens: 8192, | ||
| contextWindow: 32768, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.3, | ||
| outputPrice: 0.3, | ||
| description: "Mistral's Mixtral 8x7B MoE Instruct model with 32K context window", | ||
| }, | ||
| "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": { | ||
| maxTokens: 4096, | ||
| contextWindow: 131072, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.05, | ||
| outputPrice: 0.05, | ||
| description: "DeepSeek R1 Distill Qwen 1.5B model with 128K context window", | ||
| }, | ||
| "moonshotai/Kimi-K2-Instruct": { | ||
| maxTokens: 16384, | ||
| contextWindow: 16384, | ||
| supportsImages: false, | ||
| supportsPromptCache: false, | ||
| inputPrice: 0.3, | ||
| outputPrice: 0.3, | ||
| description: "Moonshot AI's Kimi K2 Instruct model with tool use support", | ||
| }, | ||
| } as const satisfies Record<string, ModelInfo> | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| import { type FeatherlessModelId, featherlessDefaultModelId, featherlessModels } from "@roo-code/types" | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another file that's unrelated to the Unicode fix. Also, if this provider implementation stays (though it should be in a separate PR), where are the tests for it? |
||
|
|
||
| import type { ApiHandlerOptions } from "../../shared/api" | ||
|
|
||
| import { BaseOpenAiCompatibleProvider } from "./base-openai-compatible-provider" | ||
|
|
||
| export class FeatherlessHandler extends BaseOpenAiCompatibleProvider<FeatherlessModelId> { | ||
| constructor(options: ApiHandlerOptions) { | ||
| super({ | ||
| ...options, | ||
| providerName: "Featherless", | ||
| baseURL: "https://api.featherless.ai/v1", | ||
| apiKey: options.featherlessApiKey, | ||
| defaultProviderModelId: featherlessDefaultModelId, | ||
| providerModels: featherlessModels, | ||
| defaultTemperature: 0.7, | ||
| }) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,19 @@ describe("mentionRegex and mentionRegexGlobal", () => { | |
| { input: "@a1b2c3d", expected: ["@a1b2c3d"] }, // Git commit hash (short) | ||
| { input: "@a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0", expected: ["@a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b0"] }, // Git commit hash (long) | ||
|
|
||
| // Unicode file paths (Chinese, Japanese, Korean, Arabic, etc.) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great test coverage for various Unicode scripts! Consider adding edge cases like:
|
||
| { input: "@/路径/中文文件.txt", expected: ["@/路径/中文文件.txt"] }, // Chinese characters | ||
| { input: "@/パス/日本語ファイル.txt", expected: ["@/パス/日本語ファイル.txt"] }, // Japanese characters | ||
| { input: "@/경로/한국어파일.txt", expected: ["@/경로/한국어파일.txt"] }, // Korean characters | ||
| { input: "@/مسار/ملف_عربي.txt", expected: ["@/مسار/ملف_عربي.txt"] }, // Arabic characters | ||
| { input: "@/путь/русский_файл.txt", expected: ["@/путь/русский_файл.txt"] }, // Cyrillic characters | ||
| { input: "@/dossier/fichier_français.txt", expected: ["@/dossier/fichier_français.txt"] }, // French accents | ||
| { input: "@/carpeta/archivo_español.txt", expected: ["@/carpeta/archivo_español.txt"] }, // Spanish characters | ||
| { input: "@/文件夹/", expected: ["@/文件夹/"] }, // Chinese folder | ||
| { input: "@/mixed/中文_english_日本語.txt", expected: ["@/mixed/中文_english_日本語.txt"] }, // Mixed languages | ||
| { input: "@/emoji/file_😀_test.txt", expected: ["@/emoji/file_😀_test.txt"] }, // Emoji in filename | ||
| { input: "@/path/文件\\ with\\ 空格.txt", expected: ["@/path/文件\\ with\\ 空格.txt"] }, // Unicode with escaped spaces | ||
|
|
||
| // Mentions within text | ||
| { | ||
| input: "Check file @/path/to/file\\ with\\ spaces.txt for details.", | ||
|
|
@@ -33,6 +46,8 @@ describe("mentionRegex and mentionRegexGlobal", () => { | |
| { input: "See @problems and @terminal output.", expected: ["@problems", "@terminal"] }, | ||
| { input: "URL: @https://example.com.", expected: ["@https://example.com"] }, // Trailing punctuation | ||
| { input: "Commit @a1b2c3d, then check @/file.txt", expected: ["@a1b2c3d", "@/file.txt"] }, | ||
| { input: "Check @/文档/报告.pdf for details", expected: ["@/文档/报告.pdf"] }, // Unicode in sentence | ||
| { input: "Files: @/файл1.txt, @/файл2.txt", expected: ["@/файл1.txt", "@/файл2.txt"] }, // Multiple Unicode mentions | ||
|
|
||
| // Negative cases (should not match or match partially) | ||
| { input: "@/path/with unescaped space.txt", expected: ["@/path/with"] }, // Unescaped space | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,31 +1,31 @@ | ||
| /* | ||
| Mention regex: | ||
| - **Purpose**: | ||
| - To identify and highlight specific mentions in text that start with '@'. | ||
| - These mentions can be file paths, URLs, or the exact word 'problems'. | ||
| - **Purpose**: | ||
| - To identify and highlight specific mentions in text that start with '@'. | ||
| - These mentions can be file paths (including Unicode characters), URLs, or specific keywords. | ||
| - Ensures that trailing punctuation marks (like commas, periods, etc.) are not included in the match, allowing punctuation to follow the mention without being part of it. | ||
|
|
||
| - **Regex Breakdown**: | ||
| - `/@`: | ||
| - `/@`: | ||
| - **@**: The mention must start with the '@' symbol. | ||
|
|
||
| - `((?:\/|\w+:\/\/)[^\s]+?|problems\b|git-changes\b)`: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this intentional? The comment in line 12 still shows the old regex pattern structure without mentioning the Unicode flag. Since we're updating the documentation to mention Unicode support, shouldn't we also update the actual pattern shown in the breakdown to reflect the 'u' flag that's now part of the pattern? |
||
| - **Capturing Group (`(...)`)**: Captures the part of the string that matches one of the specified patterns. | ||
| - `(?:\/|\w+:\/\/)`: | ||
| - `(?:\/|\w+:\/\/)`: | ||
| - **Non-Capturing Group (`(?:...)`)**: Groups the alternatives without capturing them for back-referencing. | ||
| - `\/`: | ||
| - `\/`: | ||
| - **Slash (`/`)**: Indicates that the mention is a file or folder path starting with a '/'. | ||
| - `|`: Logical OR. | ||
| - `\w+:\/\/`: | ||
| - **Protocol (`\w+://`)**: Matches URLs that start with a word character sequence followed by '://', such as 'http://', 'https://', 'ftp://', etc. | ||
| - `(?:[^\s\\]|\\ )+?`: | ||
| - **Non-Capturing Group (`(?:...)`)**: Groups the alternatives without capturing them. | ||
| - **Non-Whitespace and Non-Backslash (`[^\s\\]`)**: Matches any character that is not whitespace or a backslash. | ||
| - **Non-Whitespace and Non-Backslash (`[^\s\\]`)**: Matches any character that is not whitespace or a backslash, including Unicode characters. | ||
| - **OR (`|`)**: Logical OR. | ||
| - **Escaped Space (`\\ `)**: Matches a backslash followed by a space (an escaped space). | ||
| - **Non-Greedy (`+?`)**: Ensures the smallest possible match, preventing the inclusion of trailing punctuation. | ||
| - `|`: Logical OR. | ||
| - `problems\b`: | ||
| - `problems\b`: | ||
| - **Exact Word ('problems')**: Matches the exact word 'problems'. | ||
| - **Word Boundary (`\b`)**: Ensures that 'problems' is matched as a whole word and not as part of another word (e.g., 'problematic'). | ||
| - `|`: Logical OR. | ||
|
|
@@ -34,28 +34,29 @@ Mention regex: | |
| - **Word Boundary (`\b`)**: Ensures that 'terminal' is matched as a whole word and not as part of another word (e.g., 'terminals'). | ||
| - `(?=[.,;:!?]?(?=[\s\r\n]|$))`: | ||
| - **Positive Lookahead (`(?=...)`)**: Ensures that the match is followed by specific patterns without including them in the match. | ||
| - `[.,;:!?]?`: | ||
| - `[.,;:!?]?`: | ||
| - **Optional Punctuation (`[.,;:!?]?`)**: Matches zero or one of the specified punctuation marks. | ||
| - `(?=[\s\r\n]|$)`: | ||
| - `(?=[\s\r\n]|$)`: | ||
| - **Nested Positive Lookahead (`(?=[\s\r\n]|$)`)**: Ensures that the punctuation (if present) is followed by a whitespace character, a line break, or the end of the string. | ||
|
|
||
| - **Summary**: | ||
| - The regex effectively matches: | ||
| - Mentions that are file or folder paths starting with '/' and containing any non-whitespace characters (including periods within the path). | ||
| - File paths can include spaces if they are escaped with a backslash (e.g., `@/path/to/file\ with\ spaces.txt`). | ||
| - Mentions that are file or folder paths starting with '/' and containing any non-whitespace characters, including Unicode characters like Chinese, Japanese, Korean, etc. | ||
| - File paths can include spaces if they are escaped with a backslash (e.g., `@/path/to/file\ with\ spaces.txt` or `@/路径/中文文件.txt`). | ||
| - URLs that start with a protocol (like 'http://') followed by any non-whitespace characters (including query parameters). | ||
| - The exact word 'problems'. | ||
| - The exact word 'git-changes'. | ||
| - The exact word 'terminal'. | ||
| - It ensures that any trailing punctuation marks (such as ',', '.', '!', etc.) are not included in the matched mention, allowing the punctuation to follow the mention naturally in the text. | ||
| - The 'u' flag enables full Unicode support, allowing the regex to properly match Unicode characters in file paths. | ||
|
|
||
| - **Global Regex**: | ||
| - `mentionRegexGlobal`: Creates a global version of the `mentionRegex` to find all matches within a given string. | ||
| - `mentionRegexGlobal`: Creates a global version of the `mentionRegex` with Unicode support to find all matches within a given string. | ||
|
|
||
| */ | ||
| export const mentionRegex = | ||
| /(?<!\\)@((?:\/|\w+:\/\/)(?:[^\s\\]|\\ )+?|[a-f0-9]{7,40}\b|problems\b|git-changes\b|terminal\b)(?=[.,;:!?]?(?=[\s\r\n]|$))/ | ||
| export const mentionRegexGlobal = new RegExp(mentionRegex.source, "g") | ||
| /(?<!\\)@((?:\/|\w+:\/\/)(?:[^\s\\]|\\ )+?|[a-f0-9]{7,40}\b|problems\b|git-changes\b|terminal\b)(?=[.,;:!?]?(?=[\s\r\n]|$))/u | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Has the performance impact of the Unicode flag been tested with large texts containing many mentions? The 'u' flag can affect regex performance, especially with complex patterns and long strings. |
||
| export const mentionRegexGlobal = new RegExp(mentionRegex.source, "gu") | ||
|
|
||
| // Regex to match command mentions like /command-name anywhere in text | ||
| export const commandRegexGlobal = /(?:^|\s)\/([a-zA-Z0-9_\.-]+)(?=\s|$)/g | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems unrelated to the Unicode file mention fix described in the PR. Adding a new Featherless provider is a completely separate feature that should be in its own PR. Mixing unrelated changes makes it harder to review and can introduce unexpected issues.