// Package urlscan logic
package urlscan

import (
	"context"
	"fmt"
	"net/url"
	"strings"
	"time"

	jsoniter "github.com/json-iterator/go"

	"github.com/projectdiscovery/subfinder/v2/pkg/subscraping"
)

const (
	// baseURL is the URLScan search API endpoint
	baseURL = "https://urlscan.io/api/v1/search/"
	// maxPages is the maximum number of result pages to fetch
	maxPages = 5
	// maxPerPage is the maximum results per page (URLScan allows up to 10000, but 100 is safer)
	maxPerPage = 100
)

// response represents the URLScan API response structure
type response struct {
	Results []struct {
		Task struct {
			Domain string `json:"domain"`
			URL    string `json:"url"`
		} `json:"task"`
		Page struct {
			Domain string `json:"domain"`
			URL    string `json:"url"`
		} `json:"page"`
		Sort []interface{} `json:"sort"`
	} `json:"results"`
	HasMore bool `json:"has_more"`
	Total   int  `json:"total"`
}

// Source is the passive scraping agent
type Source struct {
	apiKeys   []string
	timeTaken time.Duration
	errors    int
	results   int
	requests  int
	skipped   bool
}

// Run function returns all subdomains found with the service
func (s *Source) Run(ctx context.Context, domain string, session *subscraping.Session) <-chan subscraping.Result {
	results := make(chan subscraping.Result)
	s.errors = 0
	s.results = 0
	s.requests = 0
	s.skipped = false

	go func() {
		defer func(startTime time.Time) {
			s.timeTaken = time.Since(startTime)
			close(results)
		}(time.Now())

		randomApiKey := subscraping.PickRandom(s.apiKeys, s.Name())
		if randomApiKey == "" {
			s.skipped = true
			return
		}

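		// URLScan expects the API key in the API-Key request header (HTTP header names are case-insensitive)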
		headers := map[string]string{"api-key": randomApiKey}

		// Query URLScan for the target domain and page through the results
		s.enumerate(ctx, domain, headers, session, results)
	}()

	return results
}

// enumerate performs the actual enumeration with pagination
func (s *Source) enumerate(ctx context.Context, domain string, headers map[string]string, session *subscraping.Session, results chan subscraping.Result) {
	var searchAfter string
	currentPage := 0

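	// URLScan paginates search results via search_after: each result carries a
	// "sort" array, and passing the last result's sort values back as the
	// search_after parameter returns the next page of results.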
	for {
		select {
		case <-ctx.Done():
			return
		default:
		}

		if currentPage >= maxPages {
			break
		}

		// Build search URL
		searchURL := fmt.Sprintf("%s?q=domain:%s&size=%d", baseURL, url.QueryEscape(domain), maxPerPage)
		if searchAfter != "" {
			searchURL += "&search_after=" + url.QueryEscape(searchAfter)
		}

		s.requests++
		resp, err := session.Get(ctx, searchURL, "", headers)
		if err != nil {
			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
			s.errors++
			session.DiscardHTTPResponse(resp)
			return
		}

		var data response
		err = jsoniter.NewDecoder(resp.Body).Decode(&data)
		if err != nil {
			results <- subscraping.Result{Source: s.Name(), Type: subscraping.Error, Error: err}
			s.errors++
			session.DiscardHTTPResponse(resp)
			return
		}
		session.DiscardHTTPResponse(resp)

		// Process results - extract subdomains from multiple fields
		for _, result := range data.Results {
			candidates := []string{
				result.Task.Domain,
				result.Page.Domain,
			}

			// Also extract from URLs if present
			if result.Task.URL != "" {
				if u, err := url.Parse(result.Task.URL); err == nil {
					candidates = append(candidates, u.Hostname())
				}
			}
			if result.Page.URL != "" {
				if u, err := url.Parse(result.Page.URL); err == nil {
					candidates = append(candidates, u.Hostname())
				}
			}

			for _, candidate := range candidates {
				if candidate == "" {
					continue
				}
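				// The session extractor keeps only hostnames that fall under the target domain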
				for _, subdomain := range session.Extractor.Extract(candidate) {
					select {
					case <-ctx.Done():
						return
					case results <- subscraping.Result{Source: s.Name(), Type: subscraping.Subdomain, Value: subdomain}:
						s.results++
					}
				}
			}
		}

		// Check pagination conditions
		if !data.HasMore || len(data.Results) == 0 {
			break
		}

		// Get sort value for next page
		lastResult := data.Results[len(data.Results)-1]
		if len(lastResult.Sort) == 0 {
			break
		}

		// Build search_after parameter
		sortValues := make([]string, len(lastResult.Sort))
		for i, v := range lastResult.Sort {
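			// JSON numbers are decoded into interface{} as float64, so format them
			// without a decimal point to keep numeric sort values intact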
			switch val := v.(type) {
			case float64:
				sortValues[i] = fmt.Sprintf("%.0f", val)
			default:
				sortValues[i] = fmt.Sprintf("%v", v)
			}
		}
		searchAfter = strings.Join(sortValues, ",")
		currentPage++
	}
}

// Name returns the name of the source
func (s *Source) Name() string {
	return "urlscan"
}

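// IsDefault indicates that this source is enabled for use by default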
func (s *Source) IsDefault() bool {
	return true
}

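// HasRecursiveSupport indicates that this source supports recursive subdomain enumeration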
func (s *Source) HasRecursiveSupport() bool {
	return true
}

// NeedsKey indicates that this source requires an API key
func (s *Source) NeedsKey() bool {
	return true
}

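// AddApiKeys adds the API keys to the source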
func (s *Source) AddApiKeys(keys []string) {
	s.apiKeys = keys
}

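// Statistics returns the scraping statistics of the source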
func (s *Source) Statistics() subscraping.Statistics {
	return subscraping.Statistics{
		Errors:    s.errors,
		Results:   s.results,
		Requests:  s.requests,
		TimeTaken: s.timeTaken,
		Skipped:   s.skipped,
	}
}