Skip to content

Commit 4943066

Browse files
committed
feat: some scraper cleanup
1 parent 074b468 commit 4943066

File tree

8 files changed

+57
-24
lines changed

8 files changed

+57
-24
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,4 @@ jepp_scraper/
66
cache/
77
.env
88
*/**/certs
9+
dump.*

Taskfile.yml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,18 @@ dotenv: ['.env', '{{.ENV}}/.env.', '{{.HOME}}/.env']
44

55
tasks:
66
default:
7+
desc: List all tasks and descriptions.
78
cmds:
89
- task --list-all
910

11+
build:
12+
desc: Builds both the scraper and server binaries.
13+
deps:
14+
- go:build:scraper
15+
- go:build:server
16+
1017
test:
18+
desc: Runs go tests
1119
cmds:
1220
- go test github.com/ecshreve/jepp/...
1321

@@ -17,6 +25,9 @@ tasks:
1725
- bin/scrape
1826
sources:
1927
- cmd/scrape/*.go
28+
- pkg/scraper/*.go
29+
- pkg/models/*.go
30+
- pkg/utils/*.go
2031
cmds:
2132
- go build -o bin/scrape github.com/ecshreve/jepp/cmd/scrape
2233

@@ -41,12 +52,6 @@ tasks:
4152
cmds:
4253
- swag fmt -d cmd/server,pkg/server,pkg/models,pkg/utils
4354
- swag init --parseVendor -d cmd/server,pkg/server,pkg/models,pkg/utils
44-
45-
build:
46-
desc: Builds both the scraper and server binaries.
47-
deps:
48-
- go:build:scraper
49-
- go:build:server
5055

5156
scrape:
5257
desc: Runs the scraper.

cmd/scrape/main.go

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,25 @@ import (
44
"os"
55

66
"github.com/ecshreve/jepp/pkg/models"
7+
"github.com/ecshreve/jepp/pkg/scraper"
78
log "github.com/sirupsen/logrus"
89
)
910

1011
func main() {
1112
if os.Getenv("JEPP_LOCAL_DEV") != "true" {
1213
log.Fatal("this script should only be run in a local development environment")
1314
}
15+
log.SetLevel(log.InfoLevel)
16+
log.Info("Starting Jepp scraper...")
17+
18+
models.GetDBHandle()
1419

1520
// Change loop values to scrape different seasons.
16-
for i := 38; i > 38; i-- {
17-
log.Infof("scraping season %d ", i)
18-
gamesForSeason, err := models.GetGamesBySeason(int64(i))
19-
if err != nil {
21+
for i := 15; i > 10; i-- {
22+
if err := scraper.ScrapeSeason(int64(i)); err != nil {
2023
log.Fatal(err)
2124
}
22-
23-
cluesForSeason := 0
24-
for i, game := range gamesForSeason {
25-
cluesForSeason += scrapeAndFillCluesForGame(nil, game.GameID)
26-
log.Infof("%d/%d games updated", i, len(gamesForSeason))
27-
}
28-
log.Infof("inserted %d clues and %d games for season %d", cluesForSeason, len(gamesForSeason), i)
2925
}
26+
27+
log.Info("...done scraping")
3028
}

pkg/models/category.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,12 +113,14 @@ func GetCategory(categoryID int64) (*Category, error) {
113113
}
114114

115115
func GetCategoryByName(categoryName string) (*Category, error) {
116-
var c Category
116+
query := fmt.Sprintf("SELECT * FROM category WHERE name='%s' ORDER BY category_id DESC LIMIT 1", categoryName)
117117

118-
if err := db.Get(&c, "SELECT category_id, name FROM category WHERE name=? LIMIT 1", categoryName); err != nil {
118+
c := Category{}
119+
if err := db.Get(&c, query); err != nil {
119120
return nil, oops.Wrapf(err, "could not get category for name %s", categoryName)
120121
}
121122

123+
log.Debugf("category: %+v", c)
122124
return &c, nil
123125
}
124126

pkg/models/game.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ func GetGames() ([]Game, error) {
6161

6262
// GetGamesBySeason returns a list of games in the database for a given season.
6363
func GetGamesBySeason(seasonID int64) ([]Game, error) {
64-
query := fmt.Sprintf("SELECT * FROM game WHERE season_id=%d", seasonID)
64+
query := fmt.Sprintf("SELECT * FROM game WHERE season_id=%d ORDER BY game_date DESC", seasonID)
6565

6666
games := []Game{}
6767
if err := db.Select(&games, query); err != nil {
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package main
1+
package scraper
22

33
import (
44
"fmt"
@@ -9,7 +9,7 @@ import (
99
log "github.com/sirupsen/logrus"
1010
)
1111

12-
// scrapeGame scrapes a game from j-archive.com
12+
// scrapeGameClues scrapes a game from j-archive.com.
1313
func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
1414
clueMap := map[int64]*mods.Clue{}
1515
clueStrings := map[int64]string{}
@@ -34,6 +34,7 @@ func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
3434
clueStrings[clueId] = cid
3535
})
3636

37+
// collect and parse the category names for the Jeopardy (single) round
3738
c.OnHTML("div[id=jeopardy_round]", func(e *colly.HTMLElement) {
3839
cc := []string{}
3940
e.ForEach("td.category_name", func(_ int, el *colly.HTMLElement) {
@@ -42,6 +43,7 @@ func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
4243
cats[mods.Jeopardy] = append(cats[mods.Jeopardy], cc...)
4344
})
4445

46+
// collect and parse the category names for the Double Jeopardy round
4547
c.OnHTML("div[id=double_jeopardy_round]", func(e *colly.HTMLElement) {
4648
cc := []string{}
4749
e.ForEach("td.category_name", func(_ int, el *colly.HTMLElement) {
@@ -50,6 +52,7 @@ func scrapeGameClues(gameID int64) (map[int64]*mods.Clue, map[int64]string) {
5052
cats[mods.DoubleJeopardy] = append(cats[mods.DoubleJeopardy], cc...)
5153
})
5254

55+
// collect and parse the category names for the Final Jeopardy round
5356
c.OnHTML("div[id=final_jeopardy_round]", func(e *colly.HTMLElement) {
5457
cc := []string{}
5558
e.ForEach("td.category_name", func(_ int, el *colly.HTMLElement) {
@@ -81,7 +84,7 @@ func scrapeAndFillCluesForGame(db *mods.JeppDB, gid int64) int {
8184

8285
for clueID, clue := range clues {
8386
actual, err := mods.GetCategoryByName(cats[clueID])
84-
if err != nil {
87+
if actual != nil {
8588
clue.CategoryID = actual.CategoryID
8689
continue
8790
}

pkg/scraper/scraper.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package scraper
2+
3+
import (
4+
"github.com/ecshreve/jepp/pkg/models"
5+
"github.com/samsarahq/go/oops"
6+
log "github.com/sirupsen/logrus"
7+
)
8+
9+
func ScrapeSeason(i int64) error {
10+
log.Infof("scraping season %d ", i)
11+
gamesForSeason, err := models.GetGamesBySeason(int64(i))
12+
if err != nil {
13+
return oops.Wrapf(err, "failed to get games for season %d", i)
14+
}
15+
16+
cluesForSeason := 0
17+
for i, game := range gamesForSeason {
18+
cluesForSeason += scrapeAndFillCluesForGame(nil, game.GameID)
19+
log.Infof("%d/%d games updated", i, len(gamesForSeason))
20+
}
21+
log.Infof("inserted %d clues and %d games for season %d", cluesForSeason, len(gamesForSeason), i)
22+
23+
return nil
24+
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package main
1+
package scraper
22

33
import (
44
"fmt"

0 commit comments

Comments
 (0)