
Commit ab21c33

knowledge: add url source alias option (#427)
1 parent 0176b10 commit ab21c33

7 files changed (+224, -40 lines)


docs/mkdocs/en/knowledge.md

Lines changed: 11 additions & 0 deletions
@@ -348,6 +348,17 @@ urlSrc := urlsource.New(
     urlsource.WithName("Web Content"),
 )
 
+// Advanced URL source configuration: separate content fetching from document identification.
+urlSrcAlias := urlsource.New(
+    []string{"https://trpc-go.com/docs/api.md"}, // Identifier URL (used for the document ID and metadata).
+    urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}), // URL the content is actually fetched from.
+    urlsource.WithName("TRPC API Docs"),
+    urlsource.WithMetadataValue("source", "github"),
+)
+// Note: when using WithContentFetchingURL, the identifier URL should preserve the file information of the content fetching URL, for example:
+// Correct: the identifier URL is https://trpc-go.com/docs/api.md and the fetch URL is https://github.com/.../docs/api.md
+// Incorrect: an identifier URL of https://trpc-go.com loses the document path information
+
 // Auto source: Intelligent type recognition, automatically select processor.
 autoSrc := autosource.New(
     []string{
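
To make the effect of the option concrete, here is a minimal sketch (not part of this change, assuming the same import paths and the `urlsource`/`source` package aliases used above): content is downloaded from the GitHub raw URL, while the document ID and metadata carry the trpc-go.com identifier URL.

```go
package main

import (
	"context"
	"fmt"

	"trpc.group/trpc-go/trpc-agent-go/knowledge/source"
	urlsource "trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"
)

func main() {
	urlSrcAlias := urlsource.New(
		[]string{"https://trpc-go.com/docs/api.md"}, // Identifier URL.
		urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}),
		urlsource.WithName("TRPC API Docs"),
	)

	docs, err := urlSrcAlias.ReadDocuments(context.Background())
	if err != nil {
		panic(err)
	}
	for _, d := range docs {
		// MetaURL and MetaURI carry the identifier URL, not the fetch URL.
		fmt.Println(d.Metadata[source.MetaURL], d.Metadata[source.MetaURI])
	}
}
```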

docs/mkdocs/zh/knowledge.md

Lines changed: 11 additions & 0 deletions
@@ -346,6 +346,17 @@ urlSrc := urlsource.New(
     urlsource.WithName("Web Content"),
 )
 
+// Advanced URL source configuration: separate content fetching from document identification.
+urlSrcAlias := urlsource.New(
+    []string{"https://trpc-go.com/docs/api.md"}, // Identifier URL (used for the document ID and metadata).
+    urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}), // URL the content is actually fetched from.
+    urlsource.WithName("TRPC API Docs"),
+    urlsource.WithMetadataValue("source", "github"),
+)
+// Note: when using WithContentFetchingURL, the identifier URL should preserve the file information of the content fetching URL, for example:
+// Correct: the identifier URL is https://trpc-go.com/docs/api.md and the fetch URL is https://github.com/.../docs/api.md
+// Incorrect: an identifier URL of https://trpc-go.com loses the document path information
+
 // Auto source: intelligently recognizes the type and automatically selects a processor
 autoSrc := autosource.New(
     []string{

examples/knowledge/management/README.md

Lines changed: 25 additions & 8 deletions
@@ -331,11 +331,28 @@ Source: GolangDocSource
 
 ### Default Sources
 
-The application automatically loads these default sources from the `data/` directory:
-- `golang.md`: Go programming language documentation
-- `llm.md`: Large language model concepts and terminology
-
-The system supports any text-based formats including:
-- Markdown (.md)
-- Text files (.txt)
-- Other text-based formats with proper parsing
+The application automatically loads these default sources:
+
+#### File Sources (from the `../exampledata/file/` directory):
+- **LLMDocSource** (`llm.md`): Large language model concepts and terminology
+  - Metadata: `tag: "llm"`
+- **GolangDocSource** (`golang.md`): Go programming language documentation
+  - Metadata: `tag: "golang"`
+
+#### URL Sources:
+- **Byte-pair** (urlSource1): Wikipedia article on Byte-pair encoding
+  - URL: `https://en.wikipedia.org/wiki/Byte-pair_encoding`
+  - Metadata: `tag: "wiki"`
+  - Note: direct URL fetching; the same URL is used for both content fetching and document ID generation
+
+- **trpc-go** (urlSource2): demonstrates a URL source with separate fetch and identifier URLs
+  - Identifier URL: `https://trpc-go.com/Byte-pair_encoding` (used for metadata and the document ID)
+  - Content fetch URL: `https://en.wikipedia.org/wiki/Byte-pair_encoding` (the actual content source)
+  - Metadata: `tag: "wiki"`
+  - Note: this example shows how to use `WithContentFetchingURL()` to specify different URLs for content fetching and document identification
+
+**Notice**: When using `WithContentFetchingURL()`, the identifier URL should maintain the same path structure as the fetch URL:
+- ✅ **Correct**: if the fetch URL is `http://example.com/hello.md`, the identifier URL should be `http://trpc-go.com/hello.md`
+- ❌ **Incorrect**: using `http://trpc-go.com` as the identifier URL loses the document path information
+- This ensures proper document identification and metadata consistency
+

examples/knowledge/management/main.go

Lines changed: 14 additions & 3 deletions
@@ -27,6 +27,7 @@ import (
 	geminiembedder "trpc.group/trpc-go/trpc-agent-go/knowledge/embedder/gemini"
 	openaiembedder "trpc.group/trpc-go/trpc-agent-go/knowledge/embedder/openai"
 	"trpc.group/trpc-go/trpc-agent-go/knowledge/source/file"
+	"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"
 
 	// Source.
 	"trpc.group/trpc-go/trpc-agent-go/knowledge/source"
@@ -446,14 +447,24 @@ func (chat *knowledgeChat) setupKnowledgeBase() error {
 		file.WithName("GolangDocSource"),
 		file.WithMetadata(map[string]interface{}{"tag": "golang"}),
 	)
+	urlSource1 := url.New(
+		[]string{"https://en.wikipedia.org/wiki/Byte-pair_encoding"},
+		url.WithName("Byte-pair"),
+		url.WithMetadataValue("tag", "wiki"),
+	)
+	urlSource2 := url.New(
+		[]string{"https://trpc-go.com/Byte-pair_encoding"}, // Identifier URL: because a content fetching URL is configured, this URL is used only to generate metadata and the document ID.
+		url.WithName("trpc-go"),
+		url.WithContentFetchingURL([]string{"https://en.wikipedia.org/wiki/Byte-pair_encoding"}), // Actual URL the content is fetched from.
+		url.WithMetadataValue("tag", "wiki"),
+	)
 
-	chat.source = []source.Source{fileSource1, fileSource2}
-
+	chat.source = []source.Source{fileSource1, fileSource2, urlSource1, urlSource2}
 	// Create knowledge base
 	chat.knowledge = knowledge.New(
 		knowledge.WithEmbedder(embedder),
 		knowledge.WithVectorStore(vs),
-		knowledge.WithSources([]source.Source{fileSource1, fileSource2}),
+		knowledge.WithSources([]source.Source{fileSource1, fileSource2, urlSource1, urlSource2}),
 		knowledge.WithEnableSourceSync(*sourceSync),
 	)
 

knowledge/source/url/options.go

Lines changed: 8 additions & 0 deletions
@@ -22,6 +22,14 @@ func WithName(name string) Option {
 	}
 }
 
+// WithContentFetchingURL sets the real content fetching URLs for the source.
+// These URLs are used to fetch the actual content of the documents.
+func WithContentFetchingURL(url []string) Option {
+	return func(s *Source) {
+		s.fetchURLs = url
+	}
+}
+
 // WithMetadata sets additional metadata for the source.
 func WithMetadata(metadata map[string]interface{}) Option {
 	return func(s *Source) {
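
As a usage note, here is a hedged sketch (not part of this commit; the `a.md`/`b.md` documentation paths are invented for illustration): identifier URLs and fetch URLs are paired by index, so the two slices must contain the same number of entries, otherwise `ReadDocuments` returns an error.

```go
package main

import (
	"context"
	"log"

	"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"
)

func main() {
	// Identifier and fetch URLs are paired by index; the counts must match.
	src := url.New(
		[]string{
			"https://trpc-go.com/docs/a.md", // Hypothetical; paired with the first fetch URL below.
			"https://trpc-go.com/docs/b.md", // Hypothetical; paired with the second fetch URL below.
		},
		url.WithContentFetchingURL([]string{
			"https://github.com/trpc-group/trpc-go/raw/main/docs/a.md",
			"https://github.com/trpc-group/trpc-go/raw/main/docs/b.md",
		}),
	)

	// A mismatched count would make ReadDocuments fail with
	// "fetchURLs and urls must have the same count".
	docs, err := src.ReadDocuments(context.Background())
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("loaded %d documents", len(docs))
}
```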

knowledge/source/url/url_source.go

Lines changed: 42 additions & 29 deletions
@@ -34,24 +34,25 @@ var defaultClient = &http.Client{Timeout: 30 * time.Second}
 
 // Source represents a knowledge source for URL-based content.
 type Source struct {
-	urls         []string
-	name         string
-	metadata     map[string]interface{}
-	readers      map[string]reader.Reader
-	httpClient   *http.Client
-	chunkSize    int
-	chunkOverlap int
+	identifierURLs []string // URLs used to generate document IDs and to detect document updates.
+	fetchURLs      []string // URLs actually used to fetch the content.
+	name           string
+	metadata       map[string]interface{}
+	readers        map[string]reader.Reader
+	httpClient     *http.Client
+	chunkSize      int
+	chunkOverlap   int
 }
 
 // New creates a new URL knowledge source.
 func New(urls []string, opts ...Option) *Source {
 	s := &Source{
-		urls:         urls,
-		name:         defaultURLSourceName,
-		metadata:     make(map[string]interface{}),
-		httpClient:   defaultClient,
-		chunkSize:    0,
-		chunkOverlap: 0,
+		identifierURLs: urls,
+		name:           defaultURLSourceName,
+		metadata:       make(map[string]interface{}),
+		httpClient:     defaultClient,
+		chunkSize:      0,
+		chunkOverlap:   0,
 	}
 
 	// Apply options first (capture chunk config).
@@ -69,16 +70,24 @@ func New(urls []string, opts ...Option) *Source {
 
 // ReadDocuments downloads content from all URLs and returns documents using appropriate readers.
 func (s *Source) ReadDocuments(ctx context.Context) ([]*document.Document, error) {
-	if len(s.urls) == 0 {
+	if len(s.identifierURLs) == 0 {
 		return nil, nil // Skip if no URLs provided.
 	}
 
+	if len(s.fetchURLs) > 0 && len(s.identifierURLs) != len(s.fetchURLs) {
+		return nil, fmt.Errorf("fetchURLs and urls must have the same count")
+	}
+
 	var allDocuments []*document.Document
 
-	for _, urlStr := range s.urls {
-		documents, err := s.processURL(urlStr)
+	for i, identifierURL := range s.identifierURLs {
+		fetchingURL := identifierURL
+		if len(s.fetchURLs) > 0 {
+			fetchingURL = s.fetchURLs[i]
+		}
+		documents, err := s.processURL(fetchingURL, identifierURL)
 		if err != nil {
-			return nil, fmt.Errorf("failed to process URL %s: %w", urlStr, err)
+			return nil, fmt.Errorf("failed to process URL %s: %w", identifierURL, err)
 		}
 		allDocuments = append(allDocuments, documents...)
 	}
@@ -97,15 +106,21 @@ func (s *Source) Type() string {
 }
 
 // processURL downloads content from a URL and returns its documents.
-func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
+func (s *Source) processURL(fetchingURL string, identifierURL string) ([]*document.Document, error) {
 	// Parse the URL.
-	parsedURL, err := url.Parse(urlStr)
+	_, err := url.Parse(fetchingURL)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse fetching URL: %w", err)
+	}
+
+	// Parse and validate the identifier URL.
+	parsedIdentifierURL, err := url.Parse(identifierURL)
 	if err != nil {
-		return nil, fmt.Errorf("failed to parse URL: %w", err)
+		return nil, fmt.Errorf("failed to parse identifier URL: %w", err)
 	}
 
 	// Create HTTP request with context.
-	req, err := http.NewRequest("GET", urlStr, nil)
+	req, err := http.NewRequest("GET", fetchingURL, nil)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
@@ -126,8 +141,7 @@ func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
 
 	// Determine the content type and file name.
 	contentType := resp.Header.Get("Content-Type")
-	fileName := s.getFileName(parsedURL, contentType)
-
+	fileName := s.getFileName(parsedIdentifierURL, contentType)
 	// Determine file type and get appropriate reader.
 	fileType := isource.GetFileTypeFromContentType(contentType, fileName)
 	reader, exists := s.readers[fileType]
@@ -147,12 +161,11 @@ func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
 		metadata[k] = v
 	}
 	metadata[source.MetaSource] = source.TypeURL
-	metadata[source.MetaURL] = urlStr
-	metadata[source.MetaURLHost] = parsedURL.Host
-	metadata[source.MetaURLPath] = parsedURL.Path
-	metadata[source.MetaURLScheme] = parsedURL.Scheme
-
-	metadata[source.MetaURI] = urlStr
+	metadata[source.MetaURL] = identifierURL
+	metadata[source.MetaURLHost] = parsedIdentifierURL.Host
+	metadata[source.MetaURLPath] = parsedIdentifierURL.Path
+	metadata[source.MetaURLScheme] = parsedIdentifierURL.Scheme
+	metadata[source.MetaURI] = identifierURL
 	metadata[source.MetaSourceName] = s.name
 
 	// Add metadata to all documents.

knowledge/source/url/url_source_test.go

Lines changed: 113 additions & 0 deletions
@@ -17,6 +17,7 @@ import (
 	"strings"
 	"testing"
 
+	"trpc.group/trpc-go/trpc-agent-go/knowledge/document"
 	"trpc.group/trpc-go/trpc-agent-go/knowledge/source"
 )
 
@@ -189,3 +190,115 @@ func TestSetMetadataMultiple(t *testing.T) {
 		}
 	}
 }
+
+// TestWithContentFetchingURL verifies the WithContentFetchingURL option functionality.
+func TestWithContentFetchingURL(t *testing.T) {
+	ctx := context.Background()
+
+	// Content for different servers
+	identifierContent := "This is identifier content"
+	fetchContent := "This is fetch content"
+
+	// Create identifier server
+	identifierServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		_, _ = w.Write([]byte(identifierContent))
+	}))
+	defer identifierServer.Close()
+
+	// Create fetch server
+	fetchServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		_, _ = w.Write([]byte(fetchContent))
+	}))
+	defer fetchServer.Close()
+
+	tests := []struct {
+		name           string
+		setupSource    func() *Source
+		expectedError  bool
+		validateResult func(t *testing.T, docs []*document.Document)
+	}{
+		{
+			name: "basic_content_fetching_url",
+			setupSource: func() *Source {
+				return New(
+					[]string{identifierServer.URL + "/doc.txt"},
+					WithContentFetchingURL([]string{fetchServer.URL + "/doc.txt"}),
+				)
+			},
+			expectedError: false,
+			validateResult: func(t *testing.T, docs []*document.Document) {
+				if len(docs) == 0 {
+					t.Fatal("expected at least one document")
+				}
+				// Content should come from fetch server
+				if !strings.Contains(docs[0].Content, fetchContent) {
+					t.Errorf("expected content from fetch server, got: %s", docs[0].Content)
+				}
+				// Metadata should use identifier URL
+				if metaURL, ok := docs[0].Metadata[source.MetaURL].(string); !ok || !strings.Contains(metaURL, identifierServer.URL) {
+					t.Errorf("expected metadata URL to be identifier URL, got: %v", metaURL)
+				}
+			},
+		},
+		{
+			name: "mismatched_url_count",
+			setupSource: func() *Source {
+				return New(
+					[]string{identifierServer.URL + "/doc1.txt", identifierServer.URL + "/doc2.txt"},
+					WithContentFetchingURL([]string{fetchServer.URL + "/doc1.txt"}), // Only one fetch URL for two identifier URLs
+				)
+			},
+			expectedError: true,
+			validateResult: func(t *testing.T, docs []*document.Document) {
+				// Should not reach here due to error
+			},
+		},
+		{
+			name: "multiple_urls_with_fetching",
+			setupSource: func() *Source {
+				return New(
+					[]string{identifierServer.URL + "/doc1.txt", identifierServer.URL + "/doc2.txt"},
+					WithContentFetchingURL([]string{fetchServer.URL + "/doc1.txt", fetchServer.URL + "/doc2.txt"}),
+				)
+			},
+			expectedError: false,
+			validateResult: func(t *testing.T, docs []*document.Document) {
+				if len(docs) < 2 {
+					t.Fatal("expected at least two documents")
+				}
+				// All documents should have content from fetch server
+				for _, doc := range docs {
+					if !strings.Contains(doc.Content, fetchContent) {
+						t.Errorf("expected content from fetch server, got: %s", doc.Content)
+					}
+					// Metadata should use identifier URL
+					if metaURL, ok := doc.Metadata[source.MetaURL].(string); !ok || !strings.Contains(metaURL, identifierServer.URL) {
+						t.Errorf("expected metadata URL to be identifier URL, got: %v", metaURL)
+					}
+				}
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			src := tt.setupSource()
+			docs, err := src.ReadDocuments(ctx)
+
+			if tt.expectedError {
+				if err == nil {
+					t.Fatal("expected error but got none")
+				}
				return
+			}
+
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			tt.validateResult(t, docs)
+		})
+	}
+}
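
The new test covers only the case where `WithContentFetchingURL` is set. As a hedged aside (not part of this commit), the fallback branch in `ReadDocuments`, where the identifier URL itself is fetched, could be exercised with a sketch along these lines, reusing the `httptest` pattern already present in this file:

```go
// Sketch only: without WithContentFetchingURL, the identifier URL is also
// used to fetch the content, so content and metadata refer to the same URL.
func TestReadDocumentsWithoutContentFetchingURL(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
		_, _ = w.Write([]byte("plain content"))
	}))
	defer srv.Close()

	src := New([]string{srv.URL + "/doc.txt"})
	docs, err := src.ReadDocuments(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}
	if got, ok := docs[0].Metadata[source.MetaURL].(string); !ok || !strings.Contains(got, srv.URL) {
		t.Errorf("expected metadata URL to match the identifier URL, got: %v", got)
	}
}
```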
