
Commit deaf9cb

Refactoring/main (#224)
* add config package
* Refactor scraper main.go file
* Remove unused init function
* fix error message typo
* update comments
* Refactor file type handling in config and main.go
* move ip check function to utils package
* Remove unused variables and comments
* Refactor JSON file writing and review sorting
* comments
* comments
* formatting
1 parent 0820aba commit deaf9cb

File tree

6 files changed: +143, -105 lines changed


scraper/go.mod

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
 module github.com/algo7/TripAdvisor-Review-Scraper/scraper
 
-go 1.21
+go 1.21.1
+
+toolchain go1.21.4

scraper/go.sum

Whitespace-only changes.

scraper/internal/config/config.go

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+package config
+
+import (
+    "fmt"
+    "os"
+    "strings"
+)
+
+// Config is a struct that represents the configuration for the scraper
+type Config struct {
+    LocationURL string
+    Languages   []string
+    FileType    string
+    ProxyHost   string
+}
+
+// NewConfig is a function that returns a new Config struct
+// Returns an error if the LOCATION_URL is not set
+func NewConfig() (*Config, error) {
+    // Default languages
+    defaultLanguages := []string{"en"}
+
+    // Get location URL
+    locationURL := os.Getenv("LOCATION_URL")
+    if locationURL == "" {
+        return nil, fmt.Errorf("LOCATION_URL not set")
+    }
+
+    // Get languages
+    languages := defaultLanguages
+    if envLang := os.Getenv("LANGUAGES"); envLang != "" {
+        languages = strings.Split(envLang, "|")
+    }
+
+    // Get file type
+    fileType := strings.ToLower(os.Getenv("FILETYPE"))
+    if fileType == "" {
+        fileType = "csv"
+    }
+
+    if fileType != "csv" && fileType != "json" {
+        return nil, fmt.Errorf("invalid file type. Use csv or json")
+    }
+
+    // Get proxy host
+    proxyHost := os.Getenv("PROXY_HOST")
+
+    return &Config{
+        LocationURL: locationURL,
+        Languages:   languages,
+        FileType:    fileType,
+        ProxyHost:   proxyHost,
+    }, nil
+}
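The new config package pulls all environment-variable handling out of main. A minimal usage sketch, assuming the import path shown in the diff and purely illustrative values (the URL below is a placeholder, not a real listing):

package main

import (
    "log"
    "os"

    "github.com/algo7/TripAdvisor-Review-Scraper/scraper/internal/config"
)

func main() {
    // Placeholder values for illustration only.
    os.Setenv("LOCATION_URL", "https://www.tripadvisor.com/Hotel_Review-g123456-d1234567-Reviews-Example_Hotel-Example_City.html")
    os.Setenv("LANGUAGES", "en|fr") // pipe-separated, as NewConfig splits on "|"
    os.Setenv("FILETYPE", "JSON")   // normalized to lowercase; only "csv" or "json" pass validation

    cfg, err := config.NewConfig()
    if err != nil {
        log.Fatalf("Error creating scrape config: %v", err)
    }
    log.Printf("Languages: %v, FileType: %s, Proxy: %q", cfg.Languages, cfg.FileType, cfg.ProxyHost)
}

Because validation now happens in NewConfig, an unset LOCATION_URL or an unsupported FILETYPE is reported before any network call, replacing both the old init check and the inline validation this commit removes from main.go.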

scraper/main.go

Lines changed: 27 additions & 80 deletions
@@ -2,63 +2,38 @@ package main
 
 import (
     "encoding/csv"
-    "encoding/json"
+    "fmt"
     "log"
     "math/rand"
     "net/http"
     "os"
-    "sort"
     "strconv"
-    "strings"
     "time"
 
+    "github.com/algo7/TripAdvisor-Review-Scraper/scraper/internal/config"
     "github.com/algo7/TripAdvisor-Review-Scraper/scraper/pkg/tripadvisor"
-)
-
-var (
-    // LANGUAGES is a slice of languages to be used for scraping, default is English
-    // var LANGUAGES = []string{"en", "fr", "pt", "es", "de", "it", "ru", "ja", "zh", "ko", "nl", "sv", "da", "fi", "no", "pl", "hu", "cs", "el", "tr", "th", "ar", "he", "id", "ms", "vi", "tl", "uk", "ro", "bg", "hr", "sr", "sk", "sl", "et", "lv", "lt", "sq", "mk", "hi", "bn", "pa", "gu", "ta", "te", "kn", "ml", "mr", "ur", "fa", "ne", "si", "my", "km", "lo", "am", "ka", "hy", "az", "uz", "tk", "ky", "tg", "mn", "bo", "sd", "ps", "ku", "gl", "eu", "ca", "is", "af", "xh", "zu", "ny", "st", "tn", "sn", "sw", "rw", "so", "mg", "eo", "cy", "gd", "gv", "ga", "mi", "sm", "to", "haw", "id", "jw"}
-    LANGUAGES = []string{"en"}
-
-    // FILETYPE is the type of file to be saved, default is csv
-    FILETYPE = "csv"
+    "github.com/algo7/TripAdvisor-Review-Scraper/scraper/pkg/utils"
 )
 
 func main() {
     // Scraper variables
     var allReviews []tripadvisor.Review
     var location tripadvisor.Location
 
-    // Get the location URL from the environment variable
-    locationURL := os.Getenv("LOCATION_URL")
-    log.Printf("Location URL: %s", locationURL)
-
-    // Get the languages from the environment variable of use "en" as default
-    languages := LANGUAGES
-    if os.Getenv("LANGUAGES") != "" {
-        languages = strings.Split(os.Getenv("LANGUAGES"), "|")
-    }
-    log.Printf("Languages: %v", languages)
-
-    // Get the file type from the environment variable or use "csv" as default
-    fileType := FILETYPE
-    if os.Getenv("FILETYPE") != "" {
-        fileType = os.Getenv("FILETYPE")
-    }
-    if fileType != "csv" && fileType != "json" {
-        log.Fatal("Invalid file type. Use csv or json")
+    config, err := config.NewConfig()
+    if err != nil {
+        log.Fatalf("Error creating scrape config: %v", err)
     }
-    log.Printf("File Type: %s", fileType)
 
     // Get the query type from the URL
-    queryType := tripadvisor.GetURLType(locationURL)
+    queryType := tripadvisor.GetURLType(config.LocationURL)
     if queryType == "" {
         log.Fatal("Invalid URL")
     }
     log.Printf("Location Type: %s", queryType)
 
     // Parse the location ID and location name from the URL
-    locationID, locationName, err := tripadvisor.ParseURL(locationURL, queryType)
+    locationID, locationName, err := tripadvisor.ParseURL(config.LocationURL, queryType)
     if err != nil {
         log.Fatalf("Error parsing URL: %v", err)
     }
@@ -68,34 +43,31 @@ func main() {
     // Get the query ID for the given query type.
     queryID := tripadvisor.GetQueryID(queryType)
     if err != nil {
-        log.Fatal("The location ID must be an positive integer")
+        log.Fatal("The location ID must be a positive integer")
     }
 
-    // Get the proxy host if set
-    proxyHost := os.Getenv("PROXY_HOST")
-
     // The default HTTP client
     client := &http.Client{}
 
     // If the proxy host is set, use the proxy client
-    if proxyHost != "" {
+    if config.ProxyHost != "" {
 
         // Get the HTTP client with the proxy
-        client, err = tripadvisor.GetHTTPClientWithProxy(proxyHost)
+        client, err = tripadvisor.GetHTTPClientWithProxy(config.ProxyHost)
         if err != nil {
-            log.Fatalf("Error creating HTTP client with the give proxy %s: %v", proxyHost, err)
+            log.Fatalf("Error creating HTTP client with the give proxy %s: %v", config.ProxyHost, err)
        }
 
         // Check IP
-        ip, err := tripadvisor.CheckIP(client)
+        ip, err := utils.CheckIP(client)
        if err != nil {
            log.Fatalf("Error checking IP: %v", err)
        }
        log.Printf("Proxy IP: %s", ip)
    }
 
     // Fetch the review count for the given location ID
-    reviewCount, err := tripadvisor.FetchReviewCount(client, locationID, queryType, languages)
+    reviewCount, err := tripadvisor.FetchReviewCount(client, locationID, queryType, config.Languages)
     if err != nil {
         log.Fatalf("Error fetching review count: %v", err)
     }
@@ -105,7 +77,7 @@ func main() {
     log.Printf("Review count: %d", reviewCount)
 
     // Create a file to save the reviews data
-    fileName := "reviews." + fileType
+    fileName := fmt.Sprintf("reviews.%s", config.FileType)
     fileHandle, err := os.Create(fileName)
     if err != nil {
         log.Fatalf("Error creating file %s: %v", fileName, err)
@@ -131,7 +103,7 @@ func main() {
         offset := tripadvisor.CalculateOffset(i)
 
         // Make the request to the TripAdvisor GraphQL endpoint
-        resp, err := tripadvisor.MakeRequest(client, queryID, languages, locationID, offset, 20)
+        resp, err := tripadvisor.MakeRequest(client, queryID, config.Languages, locationID, offset, 20)
         if err != nil {
             log.Fatalf("Error making request at iteration %d: %v", i, err)
         }
@@ -156,7 +128,7 @@ func main() {
         // Store the location data
         location = response[0].Data.Locations[0].Location
 
-        if fileType == "csv" {
+        if config.FileType == "csv" {
             // Iterating over the reviews
             for _, row := range reviews {
                 row := []string{
@@ -177,7 +149,8 @@ func main() {
             }
 
     }
-    if fileType == "csv" {
+
+    if config.FileType == "csv" {
         // Create a new csv writer. We are using writeAll so defer writer.Flush() is not required
 
         writer := csv.NewWriter(fileHandle)
@@ -192,45 +165,19 @@ func main() {
         if err != nil {
             log.Fatalf("Error writing data to csv: %v", err)
         }
-    } else {
-        // Write the data to the JSON file
-        const layout = "2006-01-02"
-
-        sort.Slice(allReviews, func(i, j int) bool {
-            iTime, err := time.Parse(layout, allReviews[i].CreatedDate)
-            if err != nil {
-                log.Fatalf("Error parsing time: %v", err)
-            }
-
-            jTime, err := time.Parse(layout, allReviews[j].CreatedDate)
-            if err != nil {
-                log.Fatalf("Error parsing time: %v", err)
-            }
+    }
 
-            return jTime.After(iTime)
-        })
+    // If the file type is JSON, write the data to the file
+    if config.FileType == "json" {
+        // Sort the reviews by date
+        tripadvisor.SortReviewsByDate(allReviews)
 
-        feedback := tripadvisor.Feedback{
-            Location: location,
-            Reviews:  allReviews,
-        }
-        data, err := json.Marshal(feedback)
-        if err != nil {
-            log.Fatalf("Could not marshal data: %v", err)
-        }
-        _, err = fileHandle.Write(data)
+        // Write the data to the JSON file
+        err := tripadvisor.WriteReviewsToJSONFile(allReviews, location, fileHandle)
         if err != nil {
-            log.Fatalf("Could not write data: %v", err)
+            log.Fatalf("Error writing data to JSON file: %v", err)
         }
     }
-
     log.Printf("Data written to %s", fileName)
     log.Println("Scrapping completed")
 }
-
-func init() {
-    // Check if the environment variables are set
-    if os.Getenv("LOCATION_URL") == "" {
-        log.Fatal("LOCATION_URL not set")
-    }
-}
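The fetch loop in main pages through reviews with tripadvisor.CalculateIterations and tripadvisor.CalculateOffset, requesting 20 reviews per call to MakeRequest. A self-contained sketch of that arithmetic, assuming the helpers amount to a ceiling division and an index-times-page-size offset (their actual implementations live in pkg/tripadvisor and are not part of this diff):

package main

import "fmt"

// pageSize matches the literal 20 passed to tripadvisor.MakeRequest in main.go.
const pageSize = 20

// iterationsFor approximates tripadvisor.CalculateIterations: ceil(count / pageSize).
func iterationsFor(reviewCount uint32) uint32 {
    return (reviewCount + pageSize - 1) / pageSize
}

// offsetFor approximates tripadvisor.CalculateOffset: index of the first review on page i.
func offsetFor(i uint32) uint32 {
    return i * pageSize
}

func main() {
    reviewCount := uint32(53) // example value
    for i := uint32(0); i < iterationsFor(reviewCount); i++ {
        fmt.Printf("iteration %d -> offset %d\n", i, offsetFor(i))
    }
    // 53 reviews -> 3 iterations with offsets 0, 20, 40.
}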

scraper/pkg/tripadvisor/tripadvisor.go

Lines changed: 28 additions & 24 deletions
@@ -8,8 +8,10 @@ import (
     "log"
     "net/http"
     "os"
+    "sort"
     "strconv"
     "strings"
+    "time"
 )
 
 // MakeRequest is a function that sends a POST request to the TripAdvisor GraphQL endpoint
@@ -143,30 +145,6 @@ func FetchReviewCount(client *http.Client, locationID uint32, queryType string,
     return 0, fmt.Errorf("no reviews found for location ID %d", locationID)
 }
 
-// CheckIP takes in a http client and calls ipinfo.io/ip to check the current IP address
-func CheckIP(client *http.Client) (ip string, err error) {
-
-    // Make the request to ipinfo.io/ip
-    resp, err := client.Get("https://ipinfo.io/ip")
-    if err != nil {
-        return "", fmt.Errorf("error getting IP address: %w", err)
-    }
-    defer resp.Body.Close()
-
-    // Check the response status code
-    if resp.StatusCode != http.StatusOK {
-        return "", fmt.Errorf("error response status code: %d", resp.StatusCode)
-    }
-
-    // Read the response body
-    responseBody, err := io.ReadAll(resp.Body)
-    if err != nil {
-        return "", fmt.Errorf("error reading response body: %w", err)
-    }
-
-    return string(responseBody), nil
-}
-
 // CalculateIterations is a function that calculates the number of iterations required to fetch all reviews
 func CalculateIterations(reviewCount uint32) (iterations uint32) {
 
@@ -249,3 +227,29 @@ func ParseURL(url string, locationType string) (locationID uint32, locationName
         return 0, "", fmt.Errorf("invalid location type: %s", locationType)
     }
 }
+
+func WriteReviewsToJSONFile(reviews []Review, location Location, fileHandle *os.File) error {
+    feedback := Feedback{
+        Location: location,
+        Reviews:  reviews,
+    }
+    data, err := json.Marshal(feedback)
+    if err != nil {
+        return fmt.Errorf("could not marshal data: %w", err)
+    }
+    if _, err := fileHandle.Write(data); err != nil {
+        return fmt.Errorf("could not write data to file: %w", err)
+    }
+    return nil
+}
+
+// SortReviewsByDate is a function that sorts the reviews by date
+// This function modifies the original slice
+func SortReviewsByDate(reviews []Review) {
+    const layout = "2006-01-02" // Move the layout constant here to keep it scoped to the sorting logic
+    sort.Slice(reviews, func(i, j int) bool {
+        iTime, _ := time.Parse(layout, reviews[i].CreatedDate) // Assume error handling is done elsewhere or errors are unlikely
+        jTime, _ := time.Parse(layout, reviews[j].CreatedDate)
+        return iTime.After(jTime)
+    })
+}
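SortReviewsByDate encapsulates the comparator that previously lived inline in main.go, and the comparison is reversed relative to the removed code (iTime.After(jTime) instead of jTime.After(iTime)), so reviews now come out newest first. A self-contained sketch of the same pattern with a stand-in type, since Review's full definition is not part of this diff:

package main

import (
    "fmt"
    "sort"
    "time"
)

// localReview stands in for tripadvisor.Review; only CreatedDate matters here.
type localReview struct{ CreatedDate string }

// sortByDate mirrors the new tripadvisor.SortReviewsByDate: parse the
// "2006-01-02" dates and put the most recent review first. Parse errors are
// ignored, so malformed dates collapse to the zero time and sort last.
func sortByDate(reviews []localReview) {
    const layout = "2006-01-02"
    sort.Slice(reviews, func(i, j int) bool {
        iTime, _ := time.Parse(layout, reviews[i].CreatedDate)
        jTime, _ := time.Parse(layout, reviews[j].CreatedDate)
        return iTime.After(jTime)
    })
}

func main() {
    reviews := []localReview{
        {CreatedDate: "2023-01-15"},
        {CreatedDate: "2023-11-02"},
        {CreatedDate: "2022-07-30"},
    }
    sortByDate(reviews)
    for _, r := range reviews {
        fmt.Println(r.CreatedDate)
    }
    // Output: 2023-11-02, 2023-01-15, 2022-07-30
}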

scraper/pkg/utils/utils.go

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+package utils
+
+import (
+    "fmt"
+    "io"
+    "net/http"
+)
+
+// CheckIP takes in a http client and calls ipinfo.io/ip to check the current IP address
+func CheckIP(client *http.Client) (ip string, err error) {
+
+    // Make the request to ipinfo.io/ip
+    resp, err := client.Get("https://ipinfo.io/ip")
+    if err != nil {
+        return "", fmt.Errorf("error getting IP address: %w", err)
+    }
+    defer resp.Body.Close()
+
+    // Check the response status code
+    if resp.StatusCode != http.StatusOK {
+        return "", fmt.Errorf("error response status code: %d", resp.StatusCode)
+    }
+
+    // Read the response body
+    responseBody, err := io.ReadAll(resp.Body)
+    if err != nil {
+        return "", fmt.Errorf("error reading response body: %w", err)
+    }
+
+    return string(responseBody), nil
+}
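With CheckIP moved into pkg/utils it can be used with any *http.Client, not just the proxy path in main.go. A minimal usage sketch, assuming outbound access to ipinfo.io; the timeout is an illustrative addition rather than something this commit configures:

package main

import (
    "log"
    "net/http"
    "time"

    "github.com/algo7/TripAdvisor-Review-Scraper/scraper/pkg/utils"
)

func main() {
    // main.go passes either a bare &http.Client{} or the proxy-backed client
    // from tripadvisor.GetHTTPClientWithProxy; any client works here.
    client := &http.Client{Timeout: 10 * time.Second}

    ip, err := utils.CheckIP(client)
    if err != nil {
        log.Fatalf("Error checking IP: %v", err)
    }
    log.Printf("Current public IP: %s", ip)
}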
