Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
29dea48
WIP(tool): add web fetch
liuzengh Nov 21, 2025
90586a3
add html to markdown support
liuzengh Nov 21, 2025
16542fa
change to ulrs input parameters
liuzengh Nov 21, 2025
00e0732
add conent limit option
liuzengh Nov 21, 2025
b48e364
refactor fetchone
liuzengh Nov 24, 2025
9c58d79
add url filter
liuzengh Nov 24, 2025
d787b37
update comment
liuzengh Nov 24, 2025
556e884
Merge remote-tracking branch 'origin/main' into web-fetch
liuzengh Nov 24, 2025
0e29993
fix typo and add http client test
liuzengh Nov 24, 2025
d114731
Merge remote-tracking branch 'origin' into web-fetch
liuzengh Nov 24, 2025
8d8aed9
improve code coverage
liuzengh Nov 24, 2025
1ae7f69
add todo for server-side websearh
liuzengh Nov 24, 2025
c326483
Merge remote-tracking branch 'origin/main' into web-fetch
liuzengh Nov 24, 2025
d26e5f7
add gemini webfetch
liuzengh Nov 24, 2025
3ae2a9e
Merge remote-tracking branch 'origin/main' into web-fetch
liuzengh Nov 24, 2025
d50cabe
add webfetch/httpfetch example
liuzengh Nov 24, 2025
e2e2bcc
update test
liuzengh Nov 24, 2025
9d0af65
add gemini example
liuzengh Nov 24, 2025
3f8dc49
add pacakge comment
liuzengh Nov 24, 2025
23ae92e
Merge remote-tracking branch 'origin/main' into web-fetch
liuzengh Nov 25, 2025
42ff59e
Merge remote-tracking branch 'origin/main' into web-fetch
liuzengh Nov 25, 2025
6897aa9
add package comments for urlfilter
liuzengh Nov 25, 2025
3b23dfe
add URL filtering tests to fetch_test.go
liuzengh Nov 25, 2025
43c0a20
update geminifetch testcase
liuzengh Nov 25, 2025
d4f29da
Merge remote-tracking branch 'origin/main' into web-fetch
liuzengh Nov 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions tool/webfetch/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
module trpc.group/trpc-go/trpc-agent-go/tool/webfetch

go 1.24.4

replace trpc.group/trpc-go/trpc-agent-go => ../../

require (
github.com/JohannesKaufmann/html-to-markdown/v2 v2.4.0
github.com/stretchr/testify v1.10.0
trpc.group/trpc-go/trpc-agent-go v0.0.0
)

require (
github.com/JohannesKaufmann/dom v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
go.uber.org/multierr v1.10.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/net v0.43.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a // indirect
)
30 changes: 30 additions & 0 deletions tool/webfetch/go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ=
github.com/JohannesKaufmann/dom v0.2.0/go.mod h1:57iSUl5RKric4bUkgos4zu6Xt5LMHUnw3TF1l5CbGZo=
github.com/JohannesKaufmann/html-to-markdown/v2 v2.4.0 h1:C0/TerKdQX9Y9pbYi1EsLr5LDNANsqunyI/btpyfCg8=
github.com/JohannesKaufmann/html-to-markdown/v2 v2.4.0/go.mod h1:OLaKh+giepO8j7teevrNwiy/fwf8LXgoc9g7rwaE1jk=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sebdah/goldie/v2 v2.7.1 h1:PkBHymaYdtvEkZV7TmyqKxdmn5/Vcj+8TpATWZjnG5E=
github.com/sebdah/goldie/v2 v2.7.1/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw=
github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/yuin/goldmark v1.7.13 h1:GPddIs617DnBLFFVJFgpo1aBfe/4xcvMc3SB5t/D0pA=
github.com/yuin/goldmark v1.7.13/go.mod h1:ip/1k0VRfGynBgxOz0yCqHrbZXhcjxyuS66Brc7iBKg=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ=
go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE=
golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a h1:dOon6HF2sPRFnhCLEiAeKPc21JHL2eX7UBWjIR8PLaY=
trpc.group/trpc-go/trpc-a2a-go v0.2.5-0.20251023030722-7f02b57fd14a/go.mod h1:Gtytau9Uoc3oPo/dpHvKit+tQn9Qlk5XFG1RiZTGqfk=
129 changes: 129 additions & 0 deletions tool/webfetch/urlfilter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
//
// Tencent is pleased to support the open source community by making trpc-agent-go available.
//
// Copyright (C) 2025 Tencent. All rights reserved.
//
// trpc-agent-go is licensed under the Apache License Version 2.0.
//
//

package webfetch

import (
	"fmt"
	"net/url"
	"path"
	"strings"
)

// URLFilter is a function that determines if a URL should be allowed.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

文件名改为url_filter.go

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

重构下目前的文档结构:后面可以增加 LLM Server-Side 的 WebFetch

webfetch
├── README.md
├── claudefetch
│   └── go.mod
├── geminifetch
│   └── go.mod
├── httpfetch
│   ├── fetch.go
│   ├── fetch_real_test.go
│   ├── fetch_test.go
│   ├── go.mod
│   └── go.sum
└── internal
    └── urlfilter
        ├── filter.go
        └── filter_test.go

// It receives the URL as the unparsed string handed to checkURL and returns
// true if the URL is allowed, false if it must be rejected.
type URLFilter func(url string) bool

// urlValidator pairs a URLFilter with the error text that checkURL reports
// when that filter rejects a URL.
type urlValidator struct {
// filter decides whether a URL passes this validator.
filter URLFilter
// errMsg is returned verbatim as the error when filter returns false.
errMsg string
}

// checkURL runs urlStr through every validator in order and stops at the first
// rejection. It returns an error carrying that validator's errMsg, or nil when
// every filter accepts the URL (including when validators is empty).
func checkURL(validators []urlValidator, urlStr string) error {
	for i := range validators {
		if validators[i].filter(urlStr) {
			continue
		}
		return fmt.Errorf("%s", validators[i].errMsg)
	}
	return nil
}
// newBlockPatternFilter returns a URLFilter that rejects every URL matching
// pattern (see matchPattern) and accepts everything else. URLs that cannot be
// parsed are rejected as well, so a malformed URL never slips past the block.
func newBlockPatternFilter(pattern string) URLFilter {
	return func(urlStr string) bool {
		parsed, parseErr := url.Parse(urlStr)
		// Fail-safe: an unparsable URL is treated the same as a blocked one.
		blocked := parseErr != nil || matchPattern(parsed, pattern)
		return !blocked
	}
}

// newAllowPatternsFilter returns a URLFilter that accepts a URL only when it
// matches at least one entry of patterns (see matchPattern). URLs that cannot
// be parsed, and any URL when patterns is empty, are rejected.
func newAllowPatternsFilter(patterns []string) URLFilter {
	return func(urlStr string) bool {
		parsed, parseErr := url.Parse(urlStr)
		if parseErr != nil {
			// An unparsable URL cannot be shown to be allowed.
			return false
		}
		allowed := false
		for i := 0; i < len(patterns) && !allowed; i++ {
			allowed = matchPattern(parsed, patterns[i])
		}
		return allowed
	}
}

// matchPattern reports whether u is covered by pattern.
//
// pattern is "host" or "host/path-prefix", for example "example.com" or
// "example.com/docs"; it carries no scheme and no port. The host part is
// compared with matchHost against u.Hostname(), so subdomains match and the
// URL's port is ignored. When a path prefix is present it must match the URL
// path on a segment boundary: "/docs" covers "/docs", "/docs/" and "/docs/api"
// but not "/docserver". A prefix ending in "/" covers only paths below that
// directory, so "/docs/" does not cover "/docs".
//
// Both paths are lexically normalized before comparison. Without this,
// "/docs/../admin" would satisfy an allow rule for "/docs" and
// "/public/../private" would evade a block rule for "/private", even though
// servers commonly resolve those dot segments before routing.
func matchPattern(u *url.URL, pattern string) bool {
	// Everything before the first slash is the host; the rest, slash included,
	// is the path prefix.
	patternHost, patternPath := pattern, ""
	if idx := strings.Index(pattern, "/"); idx != -1 {
		patternHost, patternPath = pattern[:idx], pattern[idx:]
	}

	// 1. Host match (case-insensitive, subdomains included).
	if !matchHost(u.Hostname(), patternHost) {
		return false
	}

	// 2. A host-only pattern covers every path on that host.
	if patternPath == "" {
		return true
	}

	// 3. Path prefix match on normalized paths. An empty URL path becomes "/".
	uPath := normalizeURLPath(u.Path)
	patternPath = normalizeURLPath(patternPath)
	if !strings.HasPrefix(uPath, patternPath) {
		return false
	}

	// Boundary check so that "/docs" does not match "/docserver".
	switch {
	case len(uPath) == len(patternPath):
		// Exact match.
		return true
	case strings.HasSuffix(patternPath, "/"):
		// The pattern names a directory, so the prefix already ends on a boundary.
		return true
	default:
		// Sub-path match: the character right after the prefix must be '/'.
		return uPath[len(patternPath)] == '/'
	}
}

// normalizeURLPath returns p as an absolute, lexically cleaned path: a leading
// slash is added when missing, and "." / ".." segments and repeated slashes
// are resolved. A trailing slash on the input is kept because matchPattern
// treats it as significant.
func normalizeURLPath(p string) string {
	if !strings.HasPrefix(p, "/") {
		p = "/" + p
	}
	cleaned := path.Clean(p)
	if cleaned != "/" && strings.HasSuffix(p, "/") {
		cleaned += "/"
	}
	return cleaned
}

// matchHost reports whether hostname is target itself or a subdomain of it.
// e.g., matchHost("www.example.com", "example.com") -> true
//
// The comparison is case-insensitive. One trailing dot (the fully-qualified
// form, e.g. "example.com.") is ignored on both sides: such a name resolves to
// the same host, so leaving it in would let "example.com." slip past a rule
// written for "example.com". A subdomain matches only on a label boundary, so
// "notexample.com" and "example.com.evil.com" do not match "example.com".
func matchHost(hostname, target string) bool {
	hostname = strings.TrimSuffix(strings.ToLower(hostname), ".")
	target = strings.TrimSuffix(strings.ToLower(target), ".")
	if hostname == target {
		return true
	}
	// Requiring the "." separator enforces the label boundary.
	return strings.HasSuffix(hostname, "."+target)
}
199 changes: 199 additions & 0 deletions tool/webfetch/urlfilter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
//
// Tencent is pleased to support the open source community by making trpc-agent-go available.
//
// Copyright (C) 2025 Tencent. All rights reserved.
//
// trpc-agent-go is licensed under the Apache License Version 2.0.
//
//

package webfetch

import (
"context"
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestWebFetch_DomainFiltering drives the tool end to end against a local
// httptest server and checks the three filtering options: WithAllowedDomains,
// WithBlockedDomains and WithURLFilter. A rejected URL is expected to surface
// as a per-result Error string rather than as an error from Call.
func TestWebFetch_DomainFiltering(t *testing.T) {
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, "OK")
}))
defer ts.Close()

// ts.URL looks like http://127.0.0.1:xxxxx, so the filtering patterns
// below use the host 127.0.0.1.

t.Run("AllowedDomains", func(t *testing.T) {
tool := NewTool(WithAllowedDomains([]string{"127.0.0.1"}))
args := fmt.Sprintf(`{"urls": ["%s"]}`, ts.URL)

res, err := tool.Call(context.Background(), []byte(args))
require.NoError(t, err)
resp := res.(fetchResponse)
assert.Len(t, resp.Results, 1)
assert.Equal(t, http.StatusOK, resp.Results[0].StatusCode)
assert.Empty(t, resp.Results[0].Error)
})

t.Run("AllowedDomains_Blocked", func(t *testing.T) {
tool := NewTool(WithAllowedDomains([]string{"example.com"})) // 127.0.0.1 not allowed
args := fmt.Sprintf(`{"urls": ["%s"]}`, ts.URL)

res, err := tool.Call(context.Background(), []byte(args))
require.NoError(t, err)
resp := res.(fetchResponse)
assert.Len(t, resp.Results, 1)
assert.Contains(t, resp.Results[0].Error, "does not match any allowed pattern")
})

t.Run("BlockedDomains", func(t *testing.T) {
tool := NewTool(WithBlockedDomains([]string{"127.0.0.1"}))
args := fmt.Sprintf(`{"urls": ["%s"]}`, ts.URL)

res, err := tool.Call(context.Background(), []byte(args))
require.NoError(t, err)
resp := res.(fetchResponse)
assert.Len(t, resp.Results, 1)
assert.Contains(t, resp.Results[0].Error, "matches blocked pattern")
})

t.Run("AllowedDomains_Subpath", func(t *testing.T) {
// The pattern carries no port. matchPattern splits the pattern at its
// first slash and compares the host part against u.Hostname(), which
// excludes the port. The pattern "127.0.0.1/docs" therefore covers
// http://127.0.0.1:<port>/docs and everything below it, while a request
// for /admin on the same host is rejected.

tool := NewTool(WithAllowedDomains([]string{"127.0.0.1/docs"}))

// Allowed path
argsOK := fmt.Sprintf(`{"urls": ["%s/docs/page1"]}`, ts.URL)
resOK, errOK := tool.Call(context.Background(), []byte(argsOK))
require.NoError(t, errOK)
respOK := resOK.(fetchResponse)
assert.Empty(t, respOK.Results[0].Error, "Should allow /docs/page1")

// Blocked path
argsBlock := fmt.Sprintf(`{"urls": ["%s/admin"]}`, ts.URL)
resBlock, errBlock := tool.Call(context.Background(), []byte(argsBlock))
require.NoError(t, errBlock)
respBlock := resBlock.(fetchResponse)
assert.Contains(t, respBlock.Results[0].Error, "not match any allowed pattern", "Should block /admin")
})

t.Run("BlockedDomains_Subpath", func(t *testing.T) {
tool := NewTool(WithBlockedDomains([]string{"127.0.0.1/private"}))

// Allowed path (not blocked)
argsOK := fmt.Sprintf(`{"urls": ["%s/public"]}`, ts.URL)
resOK, errOK := tool.Call(context.Background(), []byte(argsOK))
require.NoError(t, errOK)
respOK := resOK.(fetchResponse)
assert.Empty(t, respOK.Results[0].Error, "Should allow /public")

// Blocked path
argsBlock := fmt.Sprintf(`{"urls": ["%s/private/secret"]}`, ts.URL)
resBlock, errBlock := tool.Call(context.Background(), []byte(argsBlock))
require.NoError(t, errBlock)
respBlock := resBlock.(fetchResponse)
assert.Contains(t, respBlock.Results[0].Error, "matches blocked pattern", "Should block /private/secret")
})

t.Run("CustomURLFilter", func(t *testing.T) {
// Filter that only allows URLs containing "secure"
filter := func(u string) bool {
return strings.Contains(u, "secure")
}
tool := NewTool(WithURLFilter(filter))

// Allowed path
argsOK := fmt.Sprintf(`{"urls": ["%s/secure/page"]}`, ts.URL)
resOK, errOK := tool.Call(context.Background(), []byte(argsOK))
require.NoError(t, errOK)
respOK := resOK.(fetchResponse)
assert.Empty(t, respOK.Results[0].Error, "Should allow /secure/page")

// Blocked path
argsBlock := fmt.Sprintf(`{"urls": ["%s/unsafe/page"]}`, ts.URL)
resBlock, errBlock := tool.Call(context.Background(), []byte(argsBlock))
require.NoError(t, errBlock)
respBlock := resBlock.(fetchResponse)
assert.Contains(t, respBlock.Results[0].Error, "rejected by custom filter", "Should block /unsafe/page")
})
}

// TestMatchHost verifies that matchHost accepts an exact host or a subdomain
// on a label boundary and rejects look-alike hosts.
func TestMatchHost(t *testing.T) {
	type hostCase struct {
		hostname string
		domain   string
		expected bool
	}
	cases := []hostCase{
		{hostname: "example.com", domain: "example.com", expected: true},
		{hostname: "www.example.com", domain: "example.com", expected: true},
		{hostname: "sub.www.example.com", domain: "example.com", expected: true},
		{hostname: "example.com", domain: "google.com", expected: false},
		// Shares the suffix but lacks the dot separator.
		{hostname: "notexample.com", domain: "example.com", expected: false},
		// The domain appears as a prefix of a different registrable domain.
		{hostname: "example.com.evil.com", domain: "example.com", expected: false},
	}

	for i := range cases {
		c := cases[i]
		actual := matchHost(c.hostname, c.domain)
		assert.Equal(t, c.expected, actual, "matchHost(%q, %q)", c.hostname, c.domain)
	}
}

func TestMatchPattern(t *testing.T) {
// Helper to create URL
parse := func(s string) *url.URL {
u, _ := url.Parse(s)
return u
}

tests := []struct {
urlStr string
pattern string
want bool
}{
// Host only
{"http://example.com", "example.com", true},
{"http://www.example.com", "example.com", true},
{"http://google.com", "example.com", false},

// Host + Path
{"http://example.com/docs", "example.com/docs", true},
{"http://example.com/docs/api", "example.com/docs", true},
{"http://example.com/other", "example.com/docs", false},
{"http://example.com/docserver", "example.com/docs", false}, // boundary check
{"http://example.com", "example.com/docs", false}, // path too short

// Subdomains
{"http://www.example.com/docs", "example.com/docs", true},

// Trailing slash in pattern
{"http://example.com/docs/", "example.com/docs/", true},
{"http://example.com/docs", "example.com/docs/", false}, // url path missing slash
{"http://example.com/docs/api", "example.com/docs/", true},

// Trailing slash in URL
{"http://example.com/docs/", "example.com/docs", true},
}

for _, tt := range tests {
u := parse(tt.urlStr)
got := matchPattern(u, tt.pattern)
assert.Equal(t, tt.want, got, "matchPattern(%q, %q)", tt.urlStr, tt.pattern)
}
}
Loading
Loading