Skip to content

Commit b0826af

Browse files
committed
Add support for s3 storage
1 parent 1a0bcfa commit b0826af

File tree

6 files changed

+199
-2
lines changed

6 files changed

+199
-2
lines changed

.env.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
# in the "results" directory, relative to the current working directory.
1818
TOBEY_RESULT_REPORTER_DSN=disk://results
1919
# TOBEY_RESULT_REPORTER_DSN=webhook://host/path
20+
# TOBEY_RESULT_REPORTER_DSN=s3://bucket-name/optional/prefix
21+
22+
# When using S3 storage, configure your AWS credentials:
23+
# AWS_ACCESS_KEY_ID=your-access-key
24+
# AWS_SECRET_ACCESS_KEY=your-secret-key
25+
# AWS_REGION=us-east-1 # Optional, defaults to us-east-1
2026

2127
# DSN for progress reporting. By default, a console progress reporter is used. Uncomment to report progress to the
2228
# Factorial service or disable progress reporting.

README.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ default user agent (`Tobey/0`) for a specific run via the `ua` field in the craw
9090
}
9191
```
9292

93-
#### Attaching Metadata
93+
### Attaching Metadata
9494

9595
Arbitrary metadata can be provided along your crawl request. This metadata is internally
9696
associated with the run that is created for your request and will be part of each
@@ -209,7 +209,7 @@ usually contains the following data, its exact format depends a little bit on th
209209
### Result Reporters
210210

211211
Tobey currently supports multiple methods to handle results. You can either store
212-
them locally on disk, or forward them to a webhook endpoint.
212+
them locally on disk, forward them to a webhook endpoint, or store them in Amazon S3.
213213

214214
When you configure the crawler to **store results on disk**, it will save the results
215215
to the local filesystem. By default the results are saved in the same directory as the crawl
@@ -219,6 +219,18 @@ request if not otherwise configured.
219219
TOBEY_RESULT_REPORTER_DSN=disk:///path/to/results
220220
```
221221

222+
When you configure the crawler to **store results in S3**, it will save the results
223+
to the specified S3 bucket. The results will be organized in a directory structure
224+
under the optional prefix:
225+
226+
```sh
227+
TOBEY_RESULT_REPORTER_DSN=s3://bucket-name/optional/prefix
228+
```
229+
230+
Note: When using S3 storage, make sure you have proper AWS credentials configured
231+
either through environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`) or
232+
through AWS IAM roles.
233+
222234
[Webhooks](https://mailchimp.com/en/marketing-glossary/webhook) are a technique to notify other services about a result, once its ready.
223235
When you configure the crawler to **forward results to a webhook**, it will deliver the results to a configured webhook endpoint.
224236

go.mod

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,24 @@ require (
4343
github.com/alicebob/gopher-json v0.0.0-20200520072559-a9ecdc9d1d3a // indirect
4444
github.com/andybalholm/cascadia v1.3.2 // indirect
4545
github.com/antchfx/xpath v1.3.1 // indirect
46+
github.com/aws/aws-sdk-go-v2 v1.36.3 // indirect
47+
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.10 // indirect
48+
github.com/aws/aws-sdk-go-v2/config v1.29.9 // indirect
49+
github.com/aws/aws-sdk-go-v2/credentials v1.17.62 // indirect
50+
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 // indirect
51+
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 // indirect
52+
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 // indirect
53+
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect
54+
github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.34 // indirect
55+
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect
56+
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.2 // indirect
57+
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect
58+
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.15 // indirect
59+
github.com/aws/aws-sdk-go-v2/service/s3 v1.78.1 // indirect
60+
github.com/aws/aws-sdk-go-v2/service/sso v1.25.1 // indirect
61+
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.1 // indirect
62+
github.com/aws/aws-sdk-go-v2/service/sts v1.33.17 // indirect
63+
github.com/aws/smithy-go v1.22.2 // indirect
4664
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
4765
github.com/beorn7/perks v1.0.1 // indirect
4866
github.com/bits-and-blooms/bitset v1.13.0 // indirect

go.sum

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,42 @@ github.com/antchfx/xmlquery v1.4.1 h1:YgpSwbeWvLp557YFTi8E3z6t6/hYjmFEtiEKbDfEbl
1212
github.com/antchfx/xmlquery v1.4.1/go.mod h1:lKezcT8ELGt8kW5L+ckFMTbgdR61/odpPgDv8Gvi1fI=
1313
github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
1414
github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
15+
github.com/aws/aws-sdk-go-v2 v1.36.3 h1:mJoei2CxPutQVxaATCzDUjcZEjVRdpsiiXi2o38yqWM=
16+
github.com/aws/aws-sdk-go-v2 v1.36.3/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg=
17+
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.10 h1:zAybnyUQXIZ5mok5Jqwlf58/TFE7uvd3IAsa1aF9cXs=
18+
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.10/go.mod h1:qqvMj6gHLR/EXWZw4ZbqlPbQUyenf4h82UQUlKc+l14=
19+
github.com/aws/aws-sdk-go-v2/config v1.29.9 h1:Kg+fAYNaJeGXp1vmjtidss8O2uXIsXwaRqsQJKXVr+0=
20+
github.com/aws/aws-sdk-go-v2/config v1.29.9/go.mod h1:oU3jj2O53kgOU4TXq/yipt6ryiooYjlkqqVaZk7gY/U=
21+
github.com/aws/aws-sdk-go-v2/credentials v1.17.62 h1:fvtQY3zFzYJ9CfixuAQ96IxDrBajbBWGqjNTCa79ocU=
22+
github.com/aws/aws-sdk-go-v2/credentials v1.17.62/go.mod h1:ElETBxIQqcxej++Cs8GyPBbgMys5DgQPTwo7cUPDKt8=
23+
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw=
24+
github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M=
25+
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 h1:ZK5jHhnrioRkUNOc+hOgQKlUL5JeC3S6JgLxtQ+Rm0Q=
26+
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34/go.mod h1:p4VfIceZokChbA9FzMbRGz5OV+lekcVtHlPKEO0gSZY=
27+
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0ioo/gq8Mn6u9w19Mri8DnJ15Jf0=
28+
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q=
29+
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo=
30+
github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo=
31+
github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.34 h1:ZNTqv4nIdE/DiBfUUfXcLZ/Spcuz+RjeziUtNJackkM=
32+
github.com/aws/aws-sdk-go-v2/internal/v4a v1.3.34/go.mod h1:zf7Vcd1ViW7cPqYWEHLHJkS50X0JS2IKz9Cgaj6ugrs=
33+
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE=
34+
github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA=
35+
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.2 h1:t/gZFyrijKuSU0elA5kRngP/oU3mc0I+Dvp8HwRE4c0=
36+
github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.6.2/go.mod h1:iu6FSzgt+M2/x3Dk8zhycdIcHjEFb36IS8HVUVFoMg0=
37+
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM=
38+
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY=
39+
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.15 h1:moLQUoVq91LiqT1nbvzDukyqAlCv89ZmwaHw/ZFlFZg=
40+
github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.18.15/go.mod h1:ZH34PJUc8ApjBIfgQCFvkWcUDBtl/WTD+uiYHjd8igA=
41+
github.com/aws/aws-sdk-go-v2/service/s3 v1.78.1 h1:1M0gSbyP6q06gl3384wpoKPaH9G16NPqZFieEhLboSU=
42+
github.com/aws/aws-sdk-go-v2/service/s3 v1.78.1/go.mod h1:4qzsZSzB/KiX2EzDjs9D7A8rI/WGJxZceVJIHqtJjIU=
43+
github.com/aws/aws-sdk-go-v2/service/sso v1.25.1 h1:8JdC7Gr9NROg1Rusk25IcZeTO59zLxsKgE0gkh5O6h0=
44+
github.com/aws/aws-sdk-go-v2/service/sso v1.25.1/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI=
45+
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.1 h1:KwuLovgQPcdjNMfFt9OhUd9a2OwcOKhxfvF4glTzLuA=
46+
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.29.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs=
47+
github.com/aws/aws-sdk-go-v2/service/sts v1.33.17 h1:PZV5W8yk4OtH1JAuhV2PXwwO9v5G5Aoj+eMCn4T+1Kc=
48+
github.com/aws/aws-sdk-go-v2/service/sts v1.33.17/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4=
49+
github.com/aws/smithy-go v1.22.2 h1:6D9hW43xKFrRx/tXXfAlIZc4JI+yQe6snnWcQyxSyLQ=
50+
github.com/aws/smithy-go v1.22.2/go.mod h1:irrKGvNn1InZwb2d7fkIRNucdfwR8R+Ts3wxYa/cJHg=
1551
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
1652
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
1753
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=

results.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ func CreateResultReporter(ctx context.Context, dsn string, run *Run, res *collec
5252
return func(ctx context.Context, run *Run, res *collector.Response) error {
5353
return ReportResultToWebhook(ctx, config, run, res)
5454
}, err
55+
case "s3":
56+
config, err := newS3ResultReporterConfigFromDSN(dsn)
57+
58+
return func(ctx context.Context, run *Run, res *collector.Response) error {
59+
return ReportResultToS3(ctx, config, run, res)
60+
}, err
5561
case "noop":
5662
return func(ctx context.Context, run *Run, res *collector.Response) error {
5763
return ReportResultToNoop(ctx, nil, run, res)

results_s3.go

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright 2024 Factorial GmbH. All rights reserved.
2+
//
3+
// Use of this source code is governed by a BSD-style
4+
// license that can be found in the LICENSE file.
5+
6+
package main
7+
8+
import (
9+
"bytes"
10+
"context"
11+
"crypto/sha256"
12+
"encoding/hex"
13+
"encoding/json"
14+
"fmt"
15+
"log/slog"
16+
"net/url"
17+
"path/filepath"
18+
"strings"
19+
"tobey/internal/collector"
20+
21+
"github.com/aws/aws-sdk-go-v2/aws"
22+
"github.com/aws/aws-sdk-go-v2/config"
23+
"github.com/aws/aws-sdk-go-v2/service/s3"
24+
)
25+
26+
type s3Result struct {
27+
Run string `json:"run"`
28+
RunMetadata interface{} `json:"run_metadata,omitempty"`
29+
RequestURL string `json:"request_url"`
30+
ResponseBody []byte `json:"response_body"` // Will be base64 encoded when JSON marshalled.
31+
ResponseStatusCode int `json:"response_status_code"`
32+
}
33+
34+
type S3ResultReporterConfig struct {
35+
Bucket string
36+
Prefix string
37+
}
38+
39+
func newS3ResultReporterConfigFromDSN(dsn string) (S3ResultReporterConfig, error) {
40+
s3config := S3ResultReporterConfig{}
41+
42+
u, err := url.Parse(dsn)
43+
if err != nil {
44+
return s3config, fmt.Errorf("invalid s3 result reporter DSN: %w", err)
45+
}
46+
47+
// Extract bucket and optional prefix from the path
48+
parts := strings.SplitN(strings.TrimPrefix(u.Path, "/"), "/", 2)
49+
if len(parts) == 0 {
50+
return s3config, fmt.Errorf("bucket name is required in S3 DSN")
51+
}
52+
53+
s3config.Bucket = parts[0]
54+
if len(parts) > 1 {
55+
s3config.Prefix = parts[1]
56+
}
57+
58+
return s3config, nil
59+
}
60+
61+
// ReportResultToS3 stores results in S3 as JSON files. Results are grouped by run
62+
// in a run specific directory. The directory structure is as follows:
63+
//
64+
// <prefix>/<run_uuid>/<url_hash>.json
65+
//
66+
// The <url_hash> is the SHA-256 hash of the request URL, encoded as a hex string.
67+
// The JSON file contains the result as a JSON object.
68+
func ReportResultToS3(ctx context.Context, s3config S3ResultReporterConfig, run *Run, res *collector.Response) error {
69+
logger := slog.With("run", run.ID, "url", res.Request.URL)
70+
logger.Debug("Result reporter: Saving result to S3...")
71+
72+
// Load AWS configuration using aws-sdk-go-v2/config package
73+
awsCfg, err := config.LoadDefaultConfig(ctx)
74+
if err != nil {
75+
return fmt.Errorf("unable to load AWS SDK config: %w", err)
76+
}
77+
78+
// Create S3 client
79+
client := s3.NewFromConfig(awsCfg)
80+
81+
result := &s3Result{
82+
Run: run.ID,
83+
RequestURL: res.Request.URL.String(),
84+
ResponseBody: res.Body[:],
85+
ResponseStatusCode: res.StatusCode,
86+
}
87+
88+
// Generate the object key
89+
hash := sha256.New()
90+
hash.Write([]byte(res.Request.URL.String()))
91+
filename := fmt.Sprintf("%s.json", hex.EncodeToString(hash.Sum(nil)))
92+
93+
// Construct the full S3 key
94+
key := filepath.Join(s3config.Prefix, run.ID, filename)
95+
key = strings.TrimLeft(key, "/") // Remove leading slash as S3 doesn't need it
96+
97+
// Marshal the result to JSON
98+
jsonData, err := json.MarshalIndent(result, "", " ")
99+
if err != nil {
100+
return err
101+
}
102+
103+
// Upload to S3
104+
_, err = client.PutObject(ctx, &s3.PutObjectInput{
105+
Bucket: aws.String(s3config.Bucket),
106+
Key: aws.String(key),
107+
Body: bytes.NewReader(jsonData),
108+
ContentType: aws.String("application/json"),
109+
})
110+
if err != nil {
111+
return fmt.Errorf("failed to upload to S3: %w", err)
112+
}
113+
114+
logger.Debug("Result reporter: Successfully saved result to S3",
115+
"bucket", s3config.Bucket,
116+
"key", key,
117+
)
118+
return nil
119+
}

0 commit comments

Comments
 (0)