Skip to content

Commit 8a57c4f

Browse files
Merge pull request #1710 from projectdiscovery/add-urlscan-source
feat: add URLScan.io as passive subdomain source
2 parents 0b51762 + 58c1e5a commit 8a57c4f

File tree

7 files changed

+226
-2
lines changed

7 files changed

+226
-2
lines changed

.github/workflows/build-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ jobs:
6565
SECURITYTRAILS_API_KEY: ${{secrets.SECURITYTRAILS_API_KEY}}
6666
SHODAN_API_KEY: ${{secrets.SHODAN_API_KEY}}
6767
THREATBOOK_API_KEY: ${{secrets.THREATBOOK_API_KEY}}
68+
URLSCAN_API_KEY: ${{secrets.URLSCAN_API_KEY}}
6869
VIRUSTOTAL_API_KEY: ${{secrets.VIRUSTOTAL_API_KEY}}
6970
WHOISXMLAPI_API_KEY: ${{secrets.WHOISXMLAPI_API_KEY}}
7071
ZOOMEYEAPI_API_KEY: ${{secrets.ZOOMEYEAPI_API_KEY}}

pkg/passive/sources.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ import (
5353
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/thc"
5454
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/threatbook"
5555
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/threatcrowd"
56+
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/urlscan"
5657
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/virustotal"
5758
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/waybackarchive"
5859
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping/sources/whoisxmlapi"
@@ -114,6 +115,7 @@ var AllSources = [...]subscraping.Source{
114115
&hudsonrock.Source{},
115116
&digitalyama.Source{},
116117
&thc.Source{},
118+
&urlscan.Source{},
117119
}
118120

119121
var sourceWarnings = mapsutil.NewSyncLockMap[string, string](

pkg/passive/sources_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ var (
6363
"digitalyama",
6464
"merklemap",
6565
"thc",
66+
"urlscan",
6667
}
6768

6869
expectedDefaultSources = []string{
@@ -105,6 +106,7 @@ var (
105106
"builtwith",
106107
"digitalyama",
107108
"thc",
109+
"urlscan",
108110
}
109111

110112
expectedDefaultRecursiveSources = []string{
@@ -121,6 +123,7 @@ var (
121123
"leakix",
122124
"facebook",
123125
"merklemap",
126+
"urlscan",
124127
// "reconcloud",
125128
}
126129
)

pkg/passive/sources_wo_auth_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ func TestSourcesWithoutKeys(t *testing.T) {
3434
"dnsdumpster", // failing with "unexpected status code 403 received"
3535
"anubis", // failing with "too many redirects"
3636
"threatcrowd", // failing with "randomly failing with unmarshal error when hit multiple times"
37+
"leakix", // now requires API key (returns 401)
38+
"reconeer", // now requires API key (returns 401)
39+
"sitedossier", // flaky - returns no results in CI
3740
}
3841

3942
domain := "hackerone.com"

pkg/runner/options.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,4 +265,5 @@ var defaultRateLimits = []string{
265265
// "gitlab=2/s",
266266
"github=83/m",
267267
"hudsonrock=5/s",
268+
"urlscan=1/s",
268269
}

pkg/subscraping/sources/facebook/ctlogs.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"github.com/projectdiscovery/gologger"
1111
"github.com/projectdiscovery/retryablehttp-go"
1212
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping"
13-
errorutil "github.com/projectdiscovery/utils/errors"
1413
"github.com/projectdiscovery/utils/generic"
1514
urlutil "github.com/projectdiscovery/utils/url"
1615
)
@@ -128,7 +127,7 @@ func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Se
128127
response := &response{}
129128
if err := json.Unmarshal(bin, response); err != nil {
130129
s.errors++
131-
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: errorutil.NewWithErr(err).Msgf("failed to unmarshal response: %s", string(bin))}
130+
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: fmt.Errorf("failed to unmarshal response: %s: %w", string(bin), err)}
132131
return
133132
}
134133
for _, v := range response.Data {
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
// Package urlscan logic
2+
package urlscan
3+
4+
import (
5+
"context"
6+
"fmt"
7+
"net/url"
8+
"strings"
9+
"time"
10+
11+
jsoniter "github.com/json-iterator/go"
12+
13+
"github.com/projectdiscovery/subfinder/v2/pkg/subscraping"
14+
)
15+
16+
// URLScan search API tuning parameters.
const (
	// baseURL is the URLScan search API endpoint; queries are appended as
	// q=domain:<domain>&size=<n>[&search_after=<cursor>].
	baseURL = "https://urlscan.io/api/v1/search/"
	// maxPages caps pagination to bound request volume per enumeration.
	maxPages = 5
	// maxPerPage is the page size requested (URLScan allows up to 10000,
	// but 100 keeps responses small and is safer for rate limits).
	maxPerPage = 100
)
24+
25+
// response models the subset of the URLScan search API response that this
// source consumes: pagination metadata plus, for each result, the scanned
// task/page domains and URLs and the sort cursor used for search_after.
type response struct {
	// HasMore is true when further pages are available via search_after.
	HasMore bool `json:"has_more"`
	// Total is the total number of matching results reported by the API.
	Total int `json:"total"`
	// Results holds the individual scan records of this page.
	Results []struct {
		Task struct {
			Domain string `json:"domain"`
			URL    string `json:"url"`
		} `json:"task"`
		Page struct {
			Domain string `json:"domain"`
			URL    string `json:"url"`
		} `json:"page"`
		// Sort carries the opaque cursor values for the next page request.
		Sort []interface{} `json:"sort"`
	} `json:"results"`
}
41+
42+
// Source is the passive scraping agent
type Source struct {
	// apiKeys holds the configured URLScan API keys; one is picked at random per run.
	apiKeys []string
	// timeTaken records the wall-clock duration of the last Run.
	timeTaken time.Duration
	// errors counts errors emitted during the last Run.
	errors int
	// results counts subdomain results emitted during the last Run.
	results int
	// requests counts HTTP requests issued during the last Run.
	requests int
	// skipped is true when the last Run was skipped for lack of an API key.
	skipped bool
}
51+
52+
// Run function returns all subdomains found with the service
53+
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
54+
results := make(chan subscraping.Result)
55+
s.errors = 0
56+
s.results = 0
57+
s.requests = 0
58+
s.skipped = false
59+
60+
go func() {
61+
defer func(startTime time.Time) {
62+
s.timeTaken = time.Since(startTime)
63+
close(results)
64+
}(time.Now())
65+
66+
randomApiKey := subscraping.PickRandom(s.apiKeys, s.Name())
67+
if randomApiKey == "" {
68+
s.skipped = true
69+
return
70+
}
71+
72+
headers := map[string]string{"api-key": randomApiKey}
73+
74+
// Search with wildcard to get more subdomain results
75+
s.enumerate(ctx, domain, headers, session, results)
76+
}()
77+
78+
return results
79+
}
80+
81+
// enumerate performs the actual enumeration with pagination
82+
func (s *Source) enumerate(ctx context.Context, domain string, headers map[string]string, session *subscraping.Session, results chan subscraping.Result) {
83+
var searchAfter string
84+
currentPage := 0
85+
86+
for {
87+
select {
88+
case <-ctx.Done():
89+
return
90+
default:
91+
}
92+
93+
if currentPage >= maxPages {
94+
break
95+
}
96+
97+
// Build search URL
98+
searchURL := fmt.Sprintf("%s?q=domain:%s&size=%d", baseURL, url.QueryEscape(domain), maxPerPage)
99+
if searchAfter != "" {
100+
searchURL += "&search_after=" + url.QueryEscape(searchAfter)
101+
}
102+
103+
s.requests++
104+
resp, err := session.Get(ctx, searchURL, "", headers)
105+
if err != nil {
106+
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
107+
s.errors++
108+
session.DiscardHTTPResponse(resp)
109+
return
110+
}
111+
112+
var data response
113+
err = jsoniter.NewDecoder(resp.Body).Decode(&data)
114+
if err != nil {
115+
results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
116+
s.errors++
117+
session.DiscardHTTPResponse(resp)
118+
return
119+
}
120+
session.DiscardHTTPResponse(resp)
121+
122+
// Process results - extract subdomains from multiple fields
123+
for _, result := range data.Results {
124+
candidates := []string{
125+
result.Task.Domain,
126+
result.Page.Domain,
127+
}
128+
129+
// Also extract from URLs if present
130+
if result.Task.URL != "" {
131+
if u, err := url.Parse(result.Task.URL); err == nil {
132+
candidates = append(candidates, u.Hostname())
133+
}
134+
}
135+
if result.Page.URL != "" {
136+
if u, err := url.Parse(result.Page.URL); err == nil {
137+
candidates = append(candidates, u.Hostname())
138+
}
139+
}
140+
141+
for _, candidate := range candidates {
142+
if candidate == "" {
143+
continue
144+
}
145+
for _, subdomain := range session.Extractor.Extract(candidate) {
146+
select {
147+
case <-ctx.Done():
148+
return
149+
case results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}:
150+
s.results++
151+
}
152+
}
153+
}
154+
}
155+
156+
// Check pagination conditions
157+
if !data.HasMore || len(data.Results) == 0 {
158+
break
159+
}
160+
161+
// Get sort value for next page
162+
lastResult := data.Results[len(data.Results)-1]
163+
if len(lastResult.Sort) == 0 {
164+
break
165+
}
166+
167+
// Build search_after parameter
168+
sortValues := make([]string, len(lastResult.Sort))
169+
for i, v := range lastResult.Sort {
170+
switch val := v.(type) {
171+
case float64:
172+
sortValues[i] = fmt.Sprintf("%.0f", val)
173+
default:
174+
sortValues[i] = fmt.Sprintf("%v", v)
175+
}
176+
}
177+
searchAfter = strings.Join(sortValues, ",")
178+
currentPage++
179+
}
180+
}
181+
182+
// Name returns the name of the source
183+
func (s *Source) Name() string {
184+
return "urlscan"
185+
}
186+
187+
// IsDefault reports whether this source is enabled by default.
func (s *Source) IsDefault() bool {
	return true
}
190+
191+
// HasRecursiveSupport reports whether this source can be used to enumerate
// subdomains of subdomains recursively.
func (s *Source) HasRecursiveSupport() bool {
	return true
}
194+
195+
// KeyRequirement reports that this source requires an API key to operate.
func (s *Source) KeyRequirement() subscraping.KeyRequirement {
	return subscraping.RequiredKey
}
198+
199+
// NeedsKey reports whether an API key must be configured for this source,
// derived from KeyRequirement so the two cannot drift apart.
func (s *Source) NeedsKey() bool {
	return s.KeyRequirement() == subscraping.RequiredKey
}
202+
203+
// AddApiKeys registers the API keys this source may use; Run picks one at
// random per invocation.
func (s *Source) AddApiKeys(keys []string) {
	s.apiKeys = keys
}
206+
207+
func (s *Source) Statistics() subscraping.Statistics {
208+
return subscraping.Statistics{
209+
Errors: s.errors,
210+
Results: s.results,
211+
Requests: s.requests,
212+
TimeTaken: s.timeTaken,
213+
Skipped: s.skipped,
214+
}
215+
}

0 commit comments

Comments
 (0)