Skip to content

Commit 8a57c4f

Browse files
Merge pull request #1710 from projectdiscovery/add-urlscan-source
feat: add URLScan.io as passive subdomain source
2 parents 0b51762 + 58c1e5a commit 8a57c4f

File tree

7 files changed

+226
-2
lines changed

7 files changed

+226
-2
lines changed

.github/workflows/build-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ jobs:
6565
SECURITYTRAILS_API_KEY: ${{secrets.SECURITYTRAILS_API_KEY}}
6666
SHODAN_API_KEY: ${{secrets.SHODAN_API_KEY}}
6767
THREATBOOK_API_KEY: ${{secrets.THREATBOOK_API_KEY}}
68+
URLSCAN_API_KEY: ${{secrets.URLSCAN_API_KEY}}
6869
VIRUSTOTAL_API_KEY: ${{secrets.VIRUSTOTAL_API_KEY}}
6970
WHOISXMLAPI_API_KEY: ${{secrets.WHOISXMLAPI_API_KEY}}
7071
ZOOMEYEAPI_API_KEY: ${{secrets.ZOOMEYEAPI_API_KEY}}

pkg/passive/sources.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ import (
5353
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/thc"
5454
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/threatbook"
5555
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/threatcrowd"
56+
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/urlscan"
5657
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/virustotal"
5758
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/waybackarchive"
5859
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/whoisxmlapi"
@@ -114,6 +115,7 @@ var AllSources = [...]subscraping.Source{
114115
&hudsonrock.Source{},
115116
&digitalyama.Source{},
116117
&thc.Source{},
118+
&urlscan.Source{},
117119
}
118120

119121
var sourceWarnings = mapsutil.NewSyncLockMap[string, string](

pkg/passive/sources_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ var (
6363
"digitalyama",
6464
"merklemap",
6565
"thc",
66+
"urlscan",
6667
}
6768

6869
expectedDefaultSources = []string{
@@ -105,6 +106,7 @@ var (
105106
"builtwith",
106107
"digitalyama",
107108
"thc",
109+
"urlscan",
108110
}
109111

110112
expectedDefaultRecursiveSources = []string{
@@ -121,6 +123,7 @@ var (
121123
"leakix",
122124
"facebook",
123125
"merklemap",
126+
"urlscan",
124127
// "reconcloud",
125128
}
126129
)

pkg/passive/sources_wo_auth_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ func TestSourcesWithoutKeys(t *testing.T) {
3434
"dnsdumpster", // failing with "unexpected status code 403 received"
3535
"anubis", // failing with "too many redirects"
3636
"threatcrowd", // failing with "randomly failing with unmarshal error when hit multiple times"
37+
"leakix", // now requires API key (returns 401)
38+
"reconeer", // now requires API key (returns 401)
39+
"sitedossier", // flaky - returns no results in CI
3740
}
3841

3942
domain := "hackerone.com"

pkg/runner/options.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,4 +265,5 @@ var defaultRateLimits = []string{
265265
// "gitlab=2/s",
266266
"github=83/m",
267267
"hudsonrock=5/s",
268+
"urlscan=1/s",
268269
}

pkg/subscraping/sources/facebook/ctlogs.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"github.com/projectdiscovery/gologger"
1111
"github.com/projectdiscovery/retryablehttp-go"
1212
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping"
13-
errorutil "github.com/projectdiscovery/utils/errors"
1413
"github.com/projectdiscovery/utils/generic"
1514
urlutil "github.com/projectdiscovery/utils/url"
1615
)
@@ -128,7 +127,7 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
128127
response := &response{}
129128
if err := json.Unmarshal(bin, response); err != nil {
130129
s.errors++
131-
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: errorutil.NewWithErr(err).Msgf("failed to unmarshal response: %s", string(bin))}
130+
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: fmt.Errorf("failed to unmarshal response: %s: %w", string(bin), err)}
132131
return
133132
}
134133
for _, v := range response.Data {
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
// Package urlscan logic
2+
package urlscan
3+
4+
import (
5+
"context"
6+
"fmt"
7+
"net/url"
8+
"strings"
9+
"time"
10+
11+
jsoniter "github.com/json-iterator/go"
12+
13+
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping"
14+
)
15+
16+
// URLScan search API tuning parameters.
const (
	// baseURL is the URLScan search API endpoint; queries are appended as
	// q=domain:<domain>&size=<n>[&search_after=<cursor>].
	baseURL = "https://urlscan.io/api/v1/search/"
	// maxPages caps pagination to bound request volume per enumeration.
	maxPages = 5
	// maxPerPage is the page size requested (URLScan allows up to 10000,
	// but 100 keeps responses small and is safer for rate limits).
	maxPerPage = 100
)
24+
25+
// response models the subset of the URLScan search API response that this
// source consumes: pagination metadata plus, for each result, the scanned
// task/page domains and URLs and the sort cursor used for search_after.
type response struct {
	// HasMore is true when further pages are available via search_after.
	HasMore bool `json:"has_more"`
	// Total is the total number of matching results reported by the API.
	Total int `json:"total"`
	// Results holds the individual scan records of this page.
	Results []struct {
		Task struct {
			Domain string `json:"domain"`
			URL    string `json:"url"`
		} `json:"task"`
		Page struct {
			Domain string `json:"domain"`
			URL    string `json:"url"`
		} `json:"page"`
		// Sort carries the opaque cursor values for the next page request.
		Sort []interface{} `json:"sort"`
	} `json:"results"`
}
41+
42+
// Source is the passive scraping agent
type Source struct {
	// apiKeys holds the configured URLScan API keys; one is picked at random per run.
	apiKeys []string
	// timeTaken records the wall-clock duration of the last Run.
	timeTaken time.Duration
	// errors counts errors emitted during the last Run.
	errors int
	// results counts subdomain results emitted during the last Run.
	results int
	// requests counts HTTP requests issued during the last Run.
	requests int
	// skipped is true when the last Run was skipped for lack of an API key.
	skipped bool
}
51+
52+
// Run function returns all subdomains found with the service
53+
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
54+
results := make(chan subscraping.Result)
55+
s.errors = 0
56+
s.results = 0
57+
s.requests = 0
58+
s.skipped = false
59+
60+
go func() {
61+
defer func(startTime time.Time) {
62+
s.timeTaken = time.Since(startTime)
63+
close(results)
64+
}(time.Now())
65+
66+
randomApiKey := subscraping.PickRandom(s.apiKeys, s.Name())
67+
if randomApiKey == "" {
68+
s.skipped = true
69+
return
70+
}
71+
72+
headers := map[string]string{"api-key": randomApiKey}
73+
74+
// Search with wildcard to get more subdomain results
75+
s.enumerate(ctx, domain, headers, session, results)
76+
}()
77+
78+
return results
79+
}
80+
81+
// enumerate performs the actual enumeration with pagination
82+
func (s *Source) enumerate(ctx context.Context, domain string, headers map[string]string, session *subscraping.Session, results chan subscraping.Result) {
83+
var searchAfter string
84+
currentPage := 0
85+
86+
for {
87+
select {
88+
case <-ctx.Done():
89+
return
90+
default:
91+
}
92+
93+
if currentPage >= maxPages {
94+
break
95+
}
96+
97+
// Build search URL
98+
searchURL := fmt.Sprintf("%s?q=domain:%s&size=%d", baseURL, url.QueryEscape(domain), maxPerPage)
99+
if searchAfter != "" {
100+
searchURL += "&search_after=" + url.QueryEscape(searchAfter)
101+
}
102+
103+
s.requests++
104+
resp, err := session.Get(ctx, searchURL, "", headers)
105+
if err != nil {
106+
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
107+
s.errors++
108+
session.DiscardHTTPResponse(resp)
109+
return
110+
}
111+
112+
var data response
113+
err = jsoniter.NewDecoder(resp.Body).Decode(&data)
114+
if err != nil {
115+
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
116+
s.errors++
117+
session.DiscardHTTPResponse(resp)
118+
return
119+
}
120+
session.DiscardHTTPResponse(resp)
121+
122+
// Process results - extract subdomains from multiple fields
123+
for _, result := range data.Results {
124+
candidates := []string{
125+
result.Task.Domain,
126+
result.Page.Domain,
127+
}
128+
129+
// Also extract from URLs if present
130+
if result.Task.URL != "" {
131+
if u, err := url.Parse(result.Task.URL); err == nil {
132+
candidates = append(candidates, u.Hostname())
133+
}
134+
}
135+
if result.Page.URL != "" {
136+
if u, err := url.Parse(result.Page.URL); err == nil {
137+
candidates = append(candidates, u.Hostname())
138+
}
139+
}
140+
141+
for _, candidate := range candidates {
142+
if candidate == "" {
143+
continue
144+
}
145+
for _, subdomain := range session.Extractor.Extract(candidate) {
146+
select {
147+
case <-ctx.Done():
148+
return
149+
case results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}:
150+
s.results++
151+
}
152+
}
153+
}
154+
}
155+
156+
// Check pagination conditions
157+
if !data.HasMore || len(data.Results) == 0 {
158+
break
159+
}
160+
161+
// Get sort value for next page
162+
lastResult := data.Results[len(data.Results)-1]
163+
if len(lastResult.Sort) == 0 {
164+
break
165+
}
166+
167+
// Build search_after parameter
168+
sortValues := make([]string, len(lastResult.Sort))
169+
for i, v := range lastResult.Sort {
170+
switch val := v.(type) {
171+
case float64:
172+
sortValues[i] = fmt.Sprintf("%.0f", val)
173+
default:
174+
sortValues[i] = fmt.Sprintf("%v", v)
175+
}
176+
}
177+
searchAfter = strings.Join(sortValues, ",")
178+
currentPage++
179+
}
180+
}
181+
182+
// Name returns the name of the source
183+
func (s *Source) Name() string {
184+
return "urlscan"
185+
}
186+
187+
// IsDefault reports whether this source is enabled by default.
func (s *Source) IsDefault() bool {
	return true
}
190+
191+
// HasRecursiveSupport reports whether this source can be used to enumerate
// subdomains of subdomains recursively.
func (s *Source) HasRecursiveSupport() bool {
	return true
}
194+
195+
// KeyRequirement reports that this source requires an API key to operate.
func (s *Source) KeyRequirement() subscraping.KeyRequirement {
	return subscraping.RequiredKey
}
198+
199+
// NeedsKey reports whether an API key must be configured for this source,
// derived from KeyRequirement so the two cannot drift apart.
func (s *Source) NeedsKey() bool {
	return s.KeyRequirement() == subscraping.RequiredKey
}
202+
203+
// AddApiKeys registers the API keys this source may use; Run picks one at
// random per invocation.
func (s *Source) AddApiKeys(keys []string) {
	s.apiKeys = keys
}
206+
207+
func (s *Source) Statistics() subscraping.Statistics {
208+
return subscraping.Statistics{
209+
Errors: s.errors,
210+
Results: s.results,
211+
Requests: s.requests,
212+
TimeTaken: s.timeTaken,
213+
Skipped: s.skipped,
214+
}
215+
}

0 commit comments

Comments
 (0)