
Commit ab21c33

knowledge: add url source alias option (#427)
1 parent 0176b10 commit ab21c33

7 files changed (+224, -40 lines)


docs/mkdocs/en/knowledge.md

Lines changed: 11 additions & 0 deletions
@@ -348,6 +348,17 @@ urlSrc := urlsource.New(
     urlsource.WithName("Web Content"),
 )
 
+// Advanced URL source configuration: separate content fetching from document identification.
+urlSrcAlias := urlsource.New(
+    []string{"https://trpc-go.com/docs/api.md"}, // Identifier URL (used for the document ID and metadata).
+    urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}), // URL the content is actually fetched from.
+    urlsource.WithName("TRPC API Docs"),
+    urlsource.WithMetadataValue("source", "github"),
+)
+// Note: when using WithContentFetchingURL, the identifier URL should preserve the file information of the content fetching URL, for example:
+// Correct: the identifier URL is https://trpc-go.com/docs/api.md and the fetch URL is https://github.com/.../docs/api.md
+// Incorrect: an identifier URL of https://trpc-go.com loses the document path information
+
 // Auto source: Intelligent type recognition, automatically select processor.
 autoSrc := autosource.New(
     []string{
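
To make the effect of the option concrete, here is a minimal sketch (not part of this change, assuming the same import paths and the `urlsource`/`source` package aliases used above): content is downloaded from the GitHub raw URL, while the document ID and metadata carry the trpc-go.com identifier URL.

```go
package main

import (
	"context"
	"fmt"

	"trpc.group/trpc-go/trpc-agent-go/knowledge/source"
	urlsource "trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"
)

func main() {
	urlSrcAlias := urlsource.New(
		[]string{"https://trpc-go.com/docs/api.md"}, // Identifier URL.
		urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}),
		urlsource.WithName("TRPC API Docs"),
	)

	docs, err := urlSrcAlias.ReadDocuments(context.Background())
	if err != nil {
		panic(err)
	}
	for _, d := range docs {
		// MetaURL and MetaURI carry the identifier URL, not the fetch URL.
		fmt.Println(d.Metadata[source.MetaURL], d.Metadata[source.MetaURI])
	}
}
```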

docs/mkdocs/zh/knowledge.md

Lines changed: 11 additions & 0 deletions
@@ -346,6 +346,17 @@ urlSrc := urlsource.New(
     urlsource.WithName("Web Content"),
 )
 
+// Advanced URL source configuration: separate content fetching from document identification.
+urlSrcAlias := urlsource.New(
+    []string{"https://trpc-go.com/docs/api.md"}, // Identifier URL (used for the document ID and metadata).
+    urlsource.WithContentFetchingURL([]string{"https://github.com/trpc-group/trpc-go/raw/main/docs/api.md"}), // URL the content is actually fetched from.
+    urlsource.WithName("TRPC API Docs"),
+    urlsource.WithMetadataValue("source", "github"),
+)
+// Note: when using WithContentFetchingURL, the identifier URL should preserve the file information of the content fetching URL, for example:
+// Correct: the identifier URL is https://trpc-go.com/docs/api.md and the fetch URL is https://github.com/.../docs/api.md
+// Incorrect: an identifier URL of https://trpc-go.com loses the document path information
+
 // Auto source: intelligently recognizes the type and automatically selects a processor
 autoSrc := autosource.New(
     []string{

examples/knowledge/management/README.md

Lines changed: 25 additions & 8 deletions
@@ -331,11 +331,28 @@ Source: GolangDocSource
 
 ### Default Sources
 
-The application automatically loads these default sources from the `data/` directory:
-- `golang.md`: Go programming language documentation
-- `llm.md`: Large language model concepts and terminology
-
-The system supports any text-based formats including:
-- Markdown (.md)
-- Text files (.txt)
-- Other text-based formats with proper parsing
+The application automatically loads these default sources:
+
+#### File Sources (from the `../exampledata/file/` directory):
+- **LLMDocSource** (`llm.md`): Large language model concepts and terminology
+  - Metadata: `tag: "llm"`
+- **GolangDocSource** (`golang.md`): Go programming language documentation
+  - Metadata: `tag: "golang"`
+
+#### URL Sources:
+- **Byte-pair** (urlSource1): Wikipedia article on Byte-pair encoding
+  - URL: `https://en.wikipedia.org/wiki/Byte-pair_encoding`
+  - Metadata: `tag: "wiki"`
+  - Note: direct URL fetching; the same URL is used for both content fetching and document ID generation
+
+- **trpc-go** (urlSource2): demonstrates a URL source with separate fetch and identifier URLs
+  - Identifier URL: `https://trpc-go.com/Byte-pair_encoding` (used for metadata and the document ID)
+  - Content fetch URL: `https://en.wikipedia.org/wiki/Byte-pair_encoding` (the actual content source)
+  - Metadata: `tag: "wiki"`
+  - Note: this example shows how to use `WithContentFetchingURL()` to specify different URLs for content fetching and document identification
+
+**Notice**: When using `WithContentFetchingURL()`, the identifier URL should maintain the same path structure as the fetch URL:
+- ✅ **Correct**: if the fetch URL is `http://example.com/hello.md`, the identifier URL should be `http://trpc-go.com/hello.md`
+- ❌ **Incorrect**: using `http://trpc-go.com` as the identifier URL loses the document path information
+- This ensures proper document identification and metadata consistency
+

examples/knowledge/management/main.go

Lines changed: 14 additions & 3 deletions
@@ -27,6 +27,7 @@ import (
 	geminiembedder "trpc.group/trpc-go/trpc-agent-go/knowledge/embedder/gemini"
 	openaiembedder "trpc.group/trpc-go/trpc-agent-go/knowledge/embedder/openai"
 	"trpc.group/trpc-go/trpc-agent-go/knowledge/source/file"
+	"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"
 
 	// Source.
 	"trpc.group/trpc-go/trpc-agent-go/knowledge/source"
@@ -446,14 +447,24 @@ func (chat *knowledgeChat) setupKnowledgeBase() error {
 		file.WithName("GolangDocSource"),
 		file.WithMetadata(map[string]interface{}{"tag": "golang"}),
 	)
+	urlSource1 := url.New(
+		[]string{"https://en.wikipedia.org/wiki/Byte-pair_encoding"},
+		url.WithName("Byte-pair"),
+		url.WithMetadataValue("tag", "wiki"),
+	)
+	urlSource2 := url.New(
+		[]string{"https://trpc-go.com/Byte-pair_encoding"}, // Identifier URL: because a content fetching URL is configured, this URL is used only to generate metadata and the document ID.
+		url.WithName("trpc-go"),
+		url.WithContentFetchingURL([]string{"https://en.wikipedia.org/wiki/Byte-pair_encoding"}), // Actual URL the content is fetched from.
+		url.WithMetadataValue("tag", "wiki"),
+	)
 
-	chat.source = []source.Source{fileSource1, fileSource2}
-
+	chat.source = []source.Source{fileSource1, fileSource2, urlSource1, urlSource2}
 	// Create knowledge base
 	chat.knowledge = knowledge.New(
 		knowledge.WithEmbedder(embedder),
 		knowledge.WithVectorStore(vs),
-		knowledge.WithSources([]source.Source{fileSource1, fileSource2}),
+		knowledge.WithSources([]source.Source{fileSource1, fileSource2, urlSource1, urlSource2}),
 		knowledge.WithEnableSourceSync(*sourceSync),
 	)
 

knowledge/source/url/options.go

Lines changed: 8 additions & 0 deletions
@@ -22,6 +22,14 @@ func WithName(name string) Option {
 	}
 }
 
+// WithContentFetchingURL sets the real content fetching URLs for the source.
+// These URLs are used to fetch the actual content of the documents.
+func WithContentFetchingURL(url []string) Option {
+	return func(s *Source) {
+		s.fetchURLs = url
+	}
+}
+
 // WithMetadata sets additional metadata for the source.
 func WithMetadata(metadata map[string]interface{}) Option {
 	return func(s *Source) {
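
As a usage note, here is a hedged sketch (not part of this commit; the `a.md`/`b.md` documentation paths are invented for illustration): identifier URLs and fetch URLs are paired by index, so the two slices must contain the same number of entries, otherwise `ReadDocuments` returns an error.

```go
package main

import (
	"context"
	"log"

	"trpc.group/trpc-go/trpc-agent-go/knowledge/source/url"
)

func main() {
	// Identifier and fetch URLs are paired by index; the counts must match.
	src := url.New(
		[]string{
			"https://trpc-go.com/docs/a.md", // Hypothetical; paired with the first fetch URL below.
			"https://trpc-go.com/docs/b.md", // Hypothetical; paired with the second fetch URL below.
		},
		url.WithContentFetchingURL([]string{
			"https://github.com/trpc-group/trpc-go/raw/main/docs/a.md",
			"https://github.com/trpc-group/trpc-go/raw/main/docs/b.md",
		}),
	)

	// A mismatched count would make ReadDocuments fail with
	// "fetchURLs and urls must have the same count".
	docs, err := src.ReadDocuments(context.Background())
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("loaded %d documents", len(docs))
}
```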

knowledge/source/url/url_source.go

Lines changed: 42 additions & 29 deletions
@@ -34,24 +34,25 @@ var defaultClient = &http.Client{Timeout: 30 * time.Second}
 
 // Source represents a knowledge source for URL-based content.
 type Source struct {
-	urls         []string
-	name         string
-	metadata     map[string]interface{}
-	readers      map[string]reader.Reader
-	httpClient   *http.Client
-	chunkSize    int
-	chunkOverlap int
+	identifierURLs []string // URLs used to generate document IDs and to detect document updates.
+	fetchURLs      []string // URLs actually used to fetch the content.
+	name           string
+	metadata       map[string]interface{}
+	readers        map[string]reader.Reader
+	httpClient     *http.Client
+	chunkSize      int
+	chunkOverlap   int
 }
 
 // New creates a new URL knowledge source.
 func New(urls []string, opts ...Option) *Source {
 	s := &Source{
-		urls:         urls,
-		name:         defaultURLSourceName,
-		metadata:     make(map[string]interface{}),
-		httpClient:   defaultClient,
-		chunkSize:    0,
-		chunkOverlap: 0,
+		identifierURLs: urls,
+		name:           defaultURLSourceName,
+		metadata:       make(map[string]interface{}),
+		httpClient:     defaultClient,
+		chunkSize:      0,
+		chunkOverlap:   0,
 	}
 
 	// Apply options first (capture chunk config).
@@ -69,16 +70,24 @@ func New(urls []string, opts ...Option) *Source {
 
 // ReadDocuments downloads content from all URLs and returns documents using appropriate readers.
 func (s *Source) ReadDocuments(ctx context.Context) ([]*document.Document, error) {
-	if len(s.urls) == 0 {
+	if len(s.identifierURLs) == 0 {
 		return nil, nil // Skip if no URLs provided.
 	}
 
+	if len(s.fetchURLs) > 0 && len(s.identifierURLs) != len(s.fetchURLs) {
+		return nil, fmt.Errorf("fetchURLs and urls must have the same count")
+	}
+
 	var allDocuments []*document.Document
 
-	for _, urlStr := range s.urls {
-		documents, err := s.processURL(urlStr)
+	for i, identifierURL := range s.identifierURLs {
+		fetchingURL := identifierURL
+		if len(s.fetchURLs) > 0 {
+			fetchingURL = s.fetchURLs[i]
+		}
+		documents, err := s.processURL(fetchingURL, identifierURL)
 		if err != nil {
-			return nil, fmt.Errorf("failed to process URL %s: %w", urlStr, err)
+			return nil, fmt.Errorf("failed to process URL %s: %w", identifierURL, err)
 		}
 		allDocuments = append(allDocuments, documents...)
 	}
@@ -97,15 +106,21 @@ func (s *Source) Type() string {
 }
 
 // processURL downloads content from a URL and returns its documents.
-func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
+func (s *Source) processURL(fetchingURL string, identifierURL string) ([]*document.Document, error) {
 	// Parse the URL.
-	parsedURL, err := url.Parse(urlStr)
+	_, err := url.Parse(fetchingURL)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse fetching URL: %w", err)
+	}
+
+	// Parse and validate the identifier URL.
+	parsedIdentifierURL, err := url.Parse(identifierURL)
 	if err != nil {
-		return nil, fmt.Errorf("failed to parse URL: %w", err)
+		return nil, fmt.Errorf("failed to parse identifier URL: %w", err)
 	}
 
 	// Create HTTP request with context.
-	req, err := http.NewRequest("GET", urlStr, nil)
+	req, err := http.NewRequest("GET", fetchingURL, nil)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create request: %w", err)
 	}
@@ -126,8 +141,7 @@ func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
 
 	// Determine the content type and file name.
 	contentType := resp.Header.Get("Content-Type")
-	fileName := s.getFileName(parsedURL, contentType)
-
+	fileName := s.getFileName(parsedIdentifierURL, contentType)
 	// Determine file type and get appropriate reader.
 	fileType := isource.GetFileTypeFromContentType(contentType, fileName)
 	reader, exists := s.readers[fileType]
@@ -147,12 +161,11 @@ func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
 		metadata[k] = v
 	}
 	metadata[source.MetaSource] = source.TypeURL
-	metadata[source.MetaURL] = urlStr
-	metadata[source.MetaURLHost] = parsedURL.Host
-	metadata[source.MetaURLPath] = parsedURL.Path
-	metadata[source.MetaURLScheme] = parsedURL.Scheme
-
-	metadata[source.MetaURI] = urlStr
+	metadata[source.MetaURL] = identifierURL
+	metadata[source.MetaURLHost] = parsedIdentifierURL.Host
+	metadata[source.MetaURLPath] = parsedIdentifierURL.Path
+	metadata[source.MetaURLScheme] = parsedIdentifierURL.Scheme
+	metadata[source.MetaURI] = identifierURL
 	metadata[source.MetaSourceName] = s.name
 
 	// Add metadata to all documents.

knowledge/source/url/url_source_test.go

Lines changed: 113 additions & 0 deletions
@@ -17,6 +17,7 @@ import (
 	"strings"
 	"testing"
 
+	"trpc.group/trpc-go/trpc-agent-go/knowledge/document"
 	"trpc.group/trpc-go/trpc-agent-go/knowledge/source"
 )
 
@@ -189,3 +190,115 @@ func TestSetMetadataMultiple(t *testing.T) {
 		}
 	}
 }
+
+// TestWithContentFetchingURL verifies the WithContentFetchingURL option functionality.
+func TestWithContentFetchingURL(t *testing.T) {
+	ctx := context.Background()
+
+	// Content for different servers
+	identifierContent := "This is identifier content"
+	fetchContent := "This is fetch content"
+
+	// Create identifier server
+	identifierServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		_, _ = w.Write([]byte(identifierContent))
+	}))
+	defer identifierServer.Close()
+
+	// Create fetch server
+	fetchServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
+		_, _ = w.Write([]byte(fetchContent))
+	}))
+	defer fetchServer.Close()
+
+	tests := []struct {
+		name           string
+		setupSource    func() *Source
+		expectedError  bool
+		validateResult func(t *testing.T, docs []*document.Document)
+	}{
+		{
+			name: "basic_content_fetching_url",
+			setupSource: func() *Source {
+				return New(
+					[]string{identifierServer.URL + "/doc.txt"},
+					WithContentFetchingURL([]string{fetchServer.URL + "/doc.txt"}),
+				)
+			},
+			expectedError: false,
+			validateResult: func(t *testing.T, docs []*document.Document) {
+				if len(docs) == 0 {
+					t.Fatal("expected at least one document")
+				}
+				// Content should come from fetch server
+				if !strings.Contains(docs[0].Content, fetchContent) {
+					t.Errorf("expected content from fetch server, got: %s", docs[0].Content)
+				}
+				// Metadata should use identifier URL
+				if metaURL, ok := docs[0].Metadata[source.MetaURL].(string); !ok || !strings.Contains(metaURL, identifierServer.URL) {
+					t.Errorf("expected metadata URL to be identifier URL, got: %v", metaURL)
+				}
+			},
+		},
+		{
+			name: "mismatched_url_count",
+			setupSource: func() *Source {
+				return New(
+					[]string{identifierServer.URL + "/doc1.txt", identifierServer.URL + "/doc2.txt"},
+					WithContentFetchingURL([]string{fetchServer.URL + "/doc1.txt"}), // Only one fetch URL for two identifier URLs
+				)
+			},
+			expectedError: true,
+			validateResult: func(t *testing.T, docs []*document.Document) {
+				// Should not reach here due to error
+			},
+		},
+		{
+			name: "multiple_urls_with_fetching",
+			setupSource: func() *Source {
+				return New(
+					[]string{identifierServer.URL + "/doc1.txt", identifierServer.URL + "/doc2.txt"},
+					WithContentFetchingURL([]string{fetchServer.URL + "/doc1.txt", fetchServer.URL + "/doc2.txt"}),
+				)
+			},
+			expectedError: false,
+			validateResult: func(t *testing.T, docs []*document.Document) {
+				if len(docs) < 2 {
+					t.Fatal("expected at least two documents")
+				}
+				// All documents should have content from fetch server
+				for _, doc := range docs {
+					if !strings.Contains(doc.Content, fetchContent) {
+						t.Errorf("expected content from fetch server, got: %s", doc.Content)
+					}
+					// Metadata should use identifier URL
+					if metaURL, ok := doc.Metadata[source.MetaURL].(string); !ok || !strings.Contains(metaURL, identifierServer.URL) {
+						t.Errorf("expected metadata URL to be identifier URL, got: %v", metaURL)
+					}
+				}
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			src := tt.setupSource()
+			docs, err := src.ReadDocuments(ctx)
+
+			if tt.expectedError {
+				if err == nil {
+					t.Fatal("expected error but got none")
+				}
				return
+			}
+
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+
+			tt.validateResult(t, docs)
+		})
+	}
+}
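
The new test covers only the case where `WithContentFetchingURL` is set. As a hedged aside (not part of this commit), the fallback branch in `ReadDocuments`, where the identifier URL itself is fetched, could be exercised with a sketch along these lines, reusing the `httptest` pattern already present in this file:

```go
// Sketch only: without WithContentFetchingURL, the identifier URL is also
// used to fetch the content, so content and metadata refer to the same URL.
func TestReadDocumentsWithoutContentFetchingURL(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/plain; charset=utf-8")
		_, _ = w.Write([]byte("plain content"))
	}))
	defer srv.Close()

	src := New([]string{srv.URL + "/doc.txt"})
	docs, err := src.ReadDocuments(context.Background())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(docs) == 0 {
		t.Fatal("expected at least one document")
	}
	if got, ok := docs[0].Metadata[source.MetaURL].(string); !ok || !strings.Contains(got, srv.URL) {
		t.Errorf("expected metadata URL to match the identifier URL, got: %v", got)
	}
}
```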
