@@ -34,24 +34,25 @@ var defaultClient = &http.Client{Timeout: 30 * time.Second}
3434
3535// Source represents a knowledge source for URL-based content.
3636type Source struct {
37- urls []string
38- name string
39- metadata map [string ]interface {}
40- readers map [string ]reader.Reader
41- httpClient * http.Client
42- chunkSize int
43- chunkOverlap int
37+ identifierURLs []string // Identifier URLs, used to generate document IDs and to check whether a document has been updated.
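38+ fetchURLs []string // Fetch URLs, the actual URLs content is downloaded from; when empty, identifierURLs are fetched directly, otherwise the count must match identifierURLs.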
39+ name string
40+ metadata map [string ]interface {}
41+ readers map [string ]reader.Reader
42+ httpClient * http.Client
43+ chunkSize int
44+ chunkOverlap int
4445}
4546
4647// New creates a new URL knowledge source.
4748func New (urls []string , opts ... Option ) * Source {
4849 s := & Source {
49- urls : urls ,
50- name : defaultURLSourceName ,
51- metadata : make (map [string ]interface {}),
52- httpClient : defaultClient ,
53- chunkSize : 0 ,
54- chunkOverlap : 0 ,
50+ identifierURLs : urls ,
51+ name : defaultURLSourceName ,
52+ metadata : make (map [string ]interface {}),
53+ httpClient : defaultClient ,
54+ chunkSize : 0 ,
55+ chunkOverlap : 0 ,
5556 }
5657
5758 // Apply options first (capture chunk config).
@@ -69,16 +70,24 @@ func New(urls []string, opts ...Option) *Source {
6970
7071// ReadDocuments downloads content from all URLs and returns documents using appropriate readers.
7172func (s * Source ) ReadDocuments (ctx context.Context ) ([]* document.Document , error ) {
72- if len (s .urls ) == 0 {
73+ if len (s .identifierURLs ) == 0 {
7374 return nil , nil // Skip if no URLs provided.
7475 }
7576
77+ if len (s .fetchURLs ) > 0 && len (s .identifierURLs ) != len (s .fetchURLs ) {
78+ return nil , fmt .Errorf ("fetchURLs and urls must have the same count" )
79+ }
80+
7681 var allDocuments []* document.Document
7782
78- for _ , urlStr := range s .urls {
79- documents , err := s .processURL (urlStr )
83+ for i , identifierURL := range s .identifierURLs {
84+ fetchingURL := identifierURL
85+ if len (s .fetchURLs ) > 0 {
86+ fetchingURL = s .fetchURLs [i ]
87+ }
88+ documents , err := s .processURL (fetchingURL , identifierURL )
8089 if err != nil {
81- return nil , fmt .Errorf ("failed to process URL %s: %w" , urlStr , err )
90+ return nil , fmt .Errorf ("failed to process URL %s: %w" , identifierURL , err )
8291 }
8392 allDocuments = append (allDocuments , documents ... )
8493 }
@@ -97,15 +106,21 @@ func (s *Source) Type() string {
97106}
98107
99108// processURL downloads content from a URL and returns its documents.
100- func (s * Source ) processURL (urlStr string ) ([]* document.Document , error ) {
109+ func (s * Source ) processURL (fetchingURL string , identifierURL string ) ([]* document.Document , error ) {
101110 // Parse the fetching URL only to validate it; the parsed result is not used.
102- parsedURL , err := url .Parse (urlStr )
111+ _ , err := url .Parse (fetchingURL )
112+ if err != nil {
113+ return nil , fmt .Errorf ("failed to parse fetching URL: %w" , err )
114+ }
115+
116+ // Parse and validate the identifier URL.
117+ parsedIdentifierURL , err := url .Parse (identifierURL )
103118 if err != nil {
104- return nil , fmt .Errorf ("failed to parse URL: %w" , err )
119+ return nil , fmt .Errorf ("failed to parse identifier URL: %w" , err )
105120 }
106121
107122 // Create HTTP request with context.
108- req , err := http .NewRequest ("GET" , urlStr , nil )
123+ req , err := http .NewRequest ("GET" , fetchingURL , nil )
109124 if err != nil {
110125 return nil , fmt .Errorf ("failed to create request: %w" , err )
111126 }
@@ -126,8 +141,7 @@ func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
126141
127142 // Determine the content type and file name.
128143 contentType := resp .Header .Get ("Content-Type" )
129- fileName := s .getFileName (parsedURL , contentType )
130-
144+ fileName := s .getFileName (parsedIdentifierURL , contentType )
131145 // Determine file type and get appropriate reader.
132146 fileType := isource .GetFileTypeFromContentType (contentType , fileName )
133147 reader , exists := s .readers [fileType ]
@@ -147,12 +161,11 @@ func (s *Source) processURL(urlStr string) ([]*document.Document, error) {
147161 metadata [k ] = v
148162 }
149163 metadata [source .MetaSource ] = source .TypeURL
150- metadata [source .MetaURL ] = urlStr
151- metadata [source .MetaURLHost ] = parsedURL .Host
152- metadata [source .MetaURLPath ] = parsedURL .Path
153- metadata [source .MetaURLScheme ] = parsedURL .Scheme
154-
155- metadata [source .MetaURI ] = urlStr
164+ metadata [source .MetaURL ] = identifierURL
165+ metadata [source .MetaURLHost ] = parsedIdentifierURL .Host
166+ metadata [source .MetaURLPath ] = parsedIdentifierURL .Path
167+ metadata [source .MetaURLScheme ] = parsedIdentifierURL .Scheme
168+ metadata [source .MetaURI ] = identifierURL
156169 metadata [source .MetaSourceName ] = s .name
157170
158171 // Add metadata to all documents.
0 commit comments