FEATURE: add context and llm controls to researcher, fix username filter (#1401)

SamSaffron · web-flow · commit 3e74eea1e5e3 · 2025-06-04T16:39:43.000+10:00
Adds context length controls to the researcher (max tokens per post and per batch)
Allows picking the LLM used by the researcher
Fixes a bug where Unicode usernames were not working in the username filter
Fixes the documentation of OR logic
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
@@ -344,6 +344,15 @@ en:
       searching: "Searching for: '%{query}'"
       tool_options:
         researcher:
+          researcher_llm:
+            name: "LLM"
+            description: "Language model to use for research (defaults to the current persona's LLM)"
+          max_tokens_per_batch:
+            name: "Maximum tokens per batch"
+            description: "Maximum number of tokens to use for each batch in the research"
+          max_tokens_per_post:
+            name: "Maximum tokens per post"
+            description: "Maximum number of tokens to use for each post in the research"
           max_results:
             name: "Maximum number of results"
             description: "Maximum number of results to include in a filter"
diff --git a/lib/personas/tools/researcher.rb b/lib/personas/tools/researcher.rb
@@ -31,26 +31,28 @@ def signature
 
           def filter_description
             <<~TEXT
-              Filter string to target specific content.
-              - Supports user (@username)
-              - post_type:first - only includes first posts in topics
-              - post_type:reply - only replies in topics
-              - date ranges (after:YYYY-MM-DD, before:YYYY-MM-DD for posts; topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD for topics)
-              - categories (category:category1,category2 or categories:category1,category2)
-              - tags (tag:tag1,tag2 or tags:tag1,tag2)
-              - groups (group:group1,group2 or groups:group1,group2)
-              - status (status:open, status:closed, status:archived, status:noreplies, status:single_user)
-              - keywords (keywords:keyword1,keyword2) - searches for specific words within post content using full-text search
-              - topic_keywords (topic_keywords:keyword1,keyword2) - searches for keywords within topics, returns all posts from matching topics
-              - topics (topic:topic_id1,topic_id2 or topics:topic_id1,topic_id2) - target specific topics by ID
-              - max_results (max_results:10) - limits the maximum number of results returned (optional)
-              - order (order:latest, order:oldest, order:latest_topic, order:oldest_topic, order:likes) - controls result ordering (optional, defaults to latest posts)
-
-              Multiple filters can be combined with spaces for AND logic. Example: '@sam after:2023-01-01 tag:feature'
-
-              Use OR to combine filter segments for inclusive logic.
-              Example: 'category:feature,bug OR tag:feature-tag' - includes posts in feature OR bug categories, OR posts with feature-tag tag
-              Example: '@sam category:bug' - includes posts by @sam AND in bug category
+              Filter string to target specific content. Space-separated filters use AND logic, OR creates separate filter groups.
+
+              **Filters:**
+              - username:user1 or usernames:user1,user2 - posts by specific users
+              - group:group1 or groups:group1,group2 - posts by users in specific groups
+              - post_type:first|reply - first posts only or replies only
+              - keywords:word1,word2 - full-text search in post content
+              - topic_keywords:word1,word2 - full-text search in topics (returns all posts from matching topics)
+              - topic:123 or topics:123,456 - specific topics by ID
+              - category:name1 or categories:name1,name2 - posts in categories (by name/slug)
+              - tag:tag1 or tags:tag1,tag2 - posts in topics with tags
+              - after:YYYY-MM-DD, before:YYYY-MM-DD - filter by post creation date
+              - topic_after:YYYY-MM-DD, topic_before:YYYY-MM-DD - filter by topic creation date
+              - status:open|closed|archived|noreplies|single_user - topic status filters
+              - max_results:N - limit results (per OR group)
+              - order:latest|oldest|latest_topic|oldest_topic|likes - sort order
+
+              **OR Logic:** Each OR group processes independently - filters don't cross boundaries.
+
+              Examples:
+              - 'username:sam after:2023-01-01' - sam's posts after date
+              - 'max_results:50 category:bugs OR tag:urgent' - (≤50 bug posts) OR (all urgent posts)
             TEXT
           end
 
@@ -60,9 +62,11 @@ def name
 
           def accepted_options
             [
+              option(:researcher_llm, type: :llm),
               option(:max_results, type: :integer),
               option(:include_private, type: :boolean),
               option(:max_tokens_per_post, type: :integer),
+              option(:max_tokens_per_batch, type: :integer),
             ]
           end
         end
@@ -134,17 +138,32 @@ def description_args
         protected
 
         MIN_TOKENS_FOR_RESEARCH = 8000
+        MIN_TOKENS_FOR_POST = 50
+
         def process_filter(filter, goals, post, &blk)
-          if llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
+          if researcher_llm.max_prompt_tokens < MIN_TOKENS_FOR_RESEARCH
             raise ArgumentError,
                   "LLM max tokens too low for research. Minimum is #{MIN_TOKENS_FOR_RESEARCH}."
           end
+
+          max_tokens_per_batch = options[:max_tokens_per_batch].to_i
+          if max_tokens_per_batch <= MIN_TOKENS_FOR_RESEARCH
+            max_tokens_per_batch = researcher_llm.max_prompt_tokens - 2000
+          end
+
+          max_tokens_per_post = options[:max_tokens_per_post]
+          if max_tokens_per_post.nil?
+            max_tokens_per_post = 2000
+          elsif max_tokens_per_post < MIN_TOKENS_FOR_POST
+            max_tokens_per_post = MIN_TOKENS_FOR_POST
+          end
+
           formatter =
             DiscourseAi::Utils::Research::LlmFormatter.new(
               filter,
-              max_tokens_per_batch: llm.max_prompt_tokens - 2000,
-              tokenizer: llm.tokenizer,
-              max_tokens_per_post: options[:max_tokens_per_post] || 2000,
+              max_tokens_per_batch: max_tokens_per_batch,
+              tokenizer: researcher_llm.tokenizer,
+              max_tokens_per_post: max_tokens_per_post,
             )
 
           results = []
@@ -164,6 +183,14 @@ def process_filter(filter, goals, post, &blk)
           end
         end
 
+        def researcher_llm
+          @researcher_llm ||=
+            (
+              options[:researcher_llm].present? &&
+                LlmModel.find_by(id: options[:researcher_llm].to_i)&.to_llm
+            ) || self.llm
+        end
+
         def run_inference(chunk_text, goals, post, &blk)
           return if context.cancel_manager&.cancelled?
 
@@ -179,7 +206,7 @@ def run_inference(chunk_text, goals, post, &blk)
             )
 
           results = []
-          llm.generate(
+          researcher_llm.generate(
             prompt,
             user: post.user,
             feature_name: context.feature_name,
diff --git a/lib/utils/research/filter.rb b/lib/utils/research/filter.rb
@@ -153,12 +153,12 @@ def self.word_to_date(str)
           end
         end
 
-        register_filter(/\A\@(\w+)\z/i) do |relation, username, filter|
-          user = User.find_by(username_lower: username.downcase)
-          if user
-            relation.where("posts.user_id = ?", user.id)
+        register_filter(/\Ausernames?:(.+)\z/i) do |relation, username, filter|
+          user_ids = User.where(username_lower: username.split(",").map(&:downcase)).pluck(:id)
+          if user_ids.empty?
+            relation.where("1 = 0")
           else
-            relation.where("1 = 0") # No results if user doesn't exist
+            relation.where("posts.user_id IN (?)", user_ids)
           end
         end
 
diff --git a/spec/lib/personas/tools/researcher_spec.rb b/spec/lib/personas/tools/researcher_spec.rb
@@ -21,6 +21,54 @@
 
   before { SiteSetting.ai_bot_enabled = true }
 
+  it "uses custom researcher_llm and applies token limits correctly" do
+    # Create a second LLM model to test the researcher_llm option
+    secondary_llm_model = Fabricate(:llm_model, name: "secondary_model")
+
+    # Create test content with long text to test token truncation
+    topic = Fabricate(:topic, category: category, tags: [tag_research])
+    long_content = "zz " * 100 # This will exceed our token limit
+    _test_post =
+      Fabricate(:post, topic: topic, raw: long_content, user: user, skip_validation: true)
+
+    prompts = nil
+    responses = [["Research completed"]]
+    researcher = nil
+
+    DiscourseAi::Completions::Llm.with_prepared_responses(
+      responses,
+      llm: secondary_llm_model,
+    ) do |_, _, _prompts|
+      researcher =
+        described_class.new(
+          { filter: "category:research-category", goals: "analyze test content", dry_run: false },
+          persona_options: {
+            "researcher_llm" => secondary_llm_model.id,
+            "max_tokens_per_post" => 50, # Very small to force truncation
+            "max_tokens_per_batch" => 8000,
+          },
+          bot_user: bot_user,
+          llm: nil,
+          context: DiscourseAi::Personas::BotContext.new(user: user, post: post),
+        )
+
+      results = researcher.invoke(&progress_blk)
+
+      expect(results[:dry_run]).to eq(false)
+      expect(results[:results]).to be_present
+
+      prompts = _prompts
+    end
+
+    expect(prompts).to be_present
+
+    user_message = prompts.first.messages.find { |m| m[:type] == :user }
+    expect(user_message[:content]).to be_present
+
+    # count how many times "zz " appears in the content (a bit of token magic: we lose a couple because we redact)
+    expect(user_message[:content].scan("zz ").count).to eq(48)
+  end
+
   describe "#invoke" do
     it "can correctly filter to a topic id" do
       researcher =
@@ -104,7 +152,7 @@
       researcher =
         described_class.new(
           {
-            filter: "category:research-category @#{user.username}",
+            filter: "category:research-category username:#{user.username}",
             goals: "find relevant content",
             dry_run: false,
           },
@@ -129,7 +177,7 @@
 
       expect(results[:dry_run]).to eq(false)
       expect(results[:goals]).to eq("find relevant content")
-      expect(results[:filter]).to eq("category:research-category @#{user.username}")
+      expect(results[:filter]).to eq("category:research-category username:#{user.username}")
       expect(results[:results].first).to include("Found: Relevant content 1")
     end
   end
diff --git a/spec/lib/utils/research/filter_spec.rb b/spec/lib/utils/research/filter_spec.rb
@@ -144,6 +144,21 @@
       end
     end
 
+    describe "can find posts by users even with unicode usernames" do
+      before { SiteSetting.unicode_usernames = true }
+      let!(:unicode_user) { Fabricate(:user, username: "aאb") }
+
+      it "can filter by unicode usernames" do
+        post = Fabricate(:post, user: unicode_user, topic: feature_topic)
+        filter = described_class.new("username:aאb")
+        expect(filter.search.pluck(:id)).to contain_exactly(post.id)
+
+        filter = described_class.new("usernames:aאb,#{user.username}")
+        posts_ids = Post.where(user_id: [unicode_user.id, user.id]).pluck(:id)
+        expect(filter.search.pluck(:id)).to contain_exactly(*posts_ids)
+      end
+    end
+
     describe "category filtering" do
       it "correctly filters posts by categories" do
         filter = described_class.new("category:Announcements")