-
-
Notifications
You must be signed in to change notification settings - Fork 137
Expand file tree
/
Copy pathscraper.go
More file actions
44 lines (35 loc) · 996 Bytes
/
scraper.go
File metadata and controls
44 lines (35 loc) · 996 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
package main
import (
"fmt"
"time"
"github.com/gocolly/colly"
)
// main crawls gabrieltanner.org starting from its home page. It prints every
// URL it requests and the heading of each article container it encounters,
// following anchor links found on the pages it scrapes.
func main() {
	// Restrict the collector to a single host so followed links stay on-site.
	c := colly.NewCollector(
		colly.AllowedDomains("gabrieltanner.org"),
	)

	// Set a rate limit for the crawler before any request is issued. Limit
	// reports an error for an unusable rule; crawling without the limit in
	// place would hit the site unthrottled, so stop instead.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 1 * time.Second,
	}); err != nil {
		fmt.Println("Setting rate limit:", err)
		return
	}

	// Log each request as it is about to be sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Callback for when a scraped page contains an article container.
	c.OnHTML(".article-container", func(e *colly.HTMLElement) {
		fmt.Println("Article heading: ", e.DOM.Find("h1").Text())
	})

	// Callback for links on scraped pages.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Resolve the anchor's href against the URL of the page it was found on.
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link == "" {
			// The href could not be turned into an absolute URL; nothing to visit.
			return
		}
		// Visit rejects links the collector will not follow (for example ones
		// it has already visited or that fall outside the allowed domain).
		// Those rejections are a normal part of crawling, so the error is
		// deliberately dropped here.
		_ = c.Visit(link)
	})

	// Callback invoked after a page has been scraped. It is registered per
	// response, so the message is printed once for every page, not once at the
	// end of the whole crawl.
	c.OnScraped(func(r *colly.Response) {
		fmt.Println("Finished scraping!")
	})

	// Seed the crawl. A failure here means nothing was scraped at all, so
	// report it rather than exiting silently.
	if err := c.Visit("https://gabrieltanner.org"); err != nil {
		fmt.Println("Visiting start URL:", err)
	}
}