-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl.ps1
More file actions
100 lines (79 loc) · 2.71 KB
/
crawl.ps1
File metadata and controls
100 lines (79 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
param(
    # Absolute URL where the crawl begins; also defines the only host that will be crawled.
    [Parameter(Mandatory = $true, Position = 0)]
    [string]$startUrl,
    # .NET regular expression; any discovered absolute URL matching it is written to the output file.
    [Parameter(Mandatory = $true, Position = 1)]
    [string]$urlMatchingRegex,
    # Path of the UTF-8 text file that receives the sorted list of matching URLs.
    [Parameter(Mandatory = $true, Position = 2)]
    [string]$outputFile,
    # Maximum link depth to follow from the start URL (start page is depth 0).
    [Parameter(Mandatory = $false, Position = 3)]
    [int]$maxDepth = 3
)
# Case-insensitive sets: one for pages already fetched, one for URLs that matched the regex.
$visitedUrls = New-Object 'System.Collections.Generic.HashSet[string]' ([System.StringComparer]::OrdinalIgnoreCase)
$matchingUrls = New-Object 'System.Collections.Generic.HashSet[string]' ([System.StringComparer]::OrdinalIgnoreCase)

# Breadth-first work queue; each entry is a hashtable carrying the URI and its crawl depth.
$queue = New-Object 'System.Collections.Generic.Queue[object]'

# Validate the start URL up front — a malformed URI aborts the run immediately.
try {
    $startUri = [System.Uri]::new($startUrl)
}
catch {
    Write-Error "Invalid start URL: $startUrl"
    exit 1
}

# Seed the crawl with the start page at depth 0.
$queue.Enqueue(@{ Url = $startUri; Depth = 0 })
Write-Host "Starting crawl at $startUrl with max depth $maxDepth"
# Breadth-first crawl: dequeue a page, record regex matches, enqueue same-host links.
while ($queue.Count -gt 0) {
    $currentItem = $queue.Dequeue()
    $currentUrl = $currentItem.Url
    $currentDepth = $currentItem.Depth
    # The queue may contain duplicates enqueued before the first visit; skip them here.
    if ($visitedUrls.Contains($currentUrl.AbsoluteUri)) {
        continue
    }
    if ($currentDepth -gt $maxDepth) {
        Write-Verbose "Max depth reached for $($currentUrl.AbsoluteUri)"
        continue
    }
    # FIX: HashSet.Add returns a bool; unsuppressed it leaked "True" into the
    # script's output stream on every crawled page.
    $null = $visitedUrls.Add($currentUrl.AbsoluteUri)
    Write-Verbose "Crawling $($currentUrl.AbsoluteUri) at depth $currentDepth"
    # FIX: test the crawled page itself against the regex. Previously only outgoing
    # links were tested, so a matching start URL could never appear in the results.
    if ($currentUrl.AbsoluteUri -match $urlMatchingRegex) {
        if ($matchingUrls.Add($currentUrl.AbsoluteUri)) {
            Write-Host "Found matching URL: $($currentUrl.AbsoluteUri)"
        }
    }
    try {
        $response = Invoke-WebRequest -Uri $currentUrl -UseBasicParsing -ErrorAction Stop
    }
    catch {
        Write-Warning "Failed to retrieve $($currentUrl.AbsoluteUri): $_"
        continue
    }
    # Examine every link on the page.
    foreach ($link in $response.Links) {
        try {
            # Resolve relative hrefs against the current page's URI.
            $absoluteUri = [System.Uri]::new($currentUrl, $link.href)
            # Stay on the start URL's host; off-site links are recorded nowhere.
            if ($absoluteUri.Host -ne $startUri.Host) {
                continue
            }
            # Record the link if it matches the pattern (Add dedupes; message fires once per URL).
            if ($absoluteUri.AbsoluteUri -match $urlMatchingRegex) {
                if ($matchingUrls.Add($absoluteUri.AbsoluteUri)) {
                    Write-Host "Found matching URL: $($absoluteUri.AbsoluteUri)"
                }
            }
            # Only follow "directory-style" URLs ending in "/".
            # NOTE(review): this skips every page URL without a trailing slash (e.g. *.html),
            # not just images as the original comment claimed — confirm this is intended.
            if (-not $visitedUrls.Contains($absoluteUri.AbsoluteUri) -and $absoluteUri.AbsoluteUri.EndsWith("/")) {
                $queue.Enqueue(@{ Url = $absoluteUri; Depth = $currentDepth + 1 })
            }
        }
        catch {
            Write-Warning "Could not process link '$($link.href)' on page $($currentUrl.AbsoluteUri): $_"
        }
    }
}
# FIX: save once after the crawl completes. The original save block sat inside the
# while loop, rewriting the entire output file after every page and exiting mid-crawl
# on a transient write failure.
try {
    $matchingUrls | Sort-Object | Out-File -FilePath $outputFile -Encoding utf8 -ErrorAction Stop
    Write-Host "Results saved to $outputFile"
}
catch {
    Write-Error "Failed to write to output file $outputFile : $_"
    exit 1
}
Write-Host "Crawl finished. Found $($matchingUrls.Count) matching URLs."