diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..a9b7d5c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,21 @@ +root = true + +[*] +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +charset = utf-8 + +[*.go] +indent_style = tab +indent_size = 4 + +[*.{yml,yaml}] +indent_style = space +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.env.example b/.env.example index 772a624..ba8f927 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,7 @@ -API_URL=http://localhost:3002 -TEST_API_KEY=fc-YOUR-API-KEY +# Firecrawl SDK Runtime (used by your application) +# FIRECRAWL_API_KEY=fc-your-api-key +# FIRECRAWL_API_URL=https://api.firecrawl.dev + +# Integration Tests (used by `make test-integration`) +API_URL=https://api.firecrawl.dev +TEST_API_KEY=fc-your-test-api-key diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..f78906c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,19 @@ +version: 2 +updates: + - package-ecosystem: gomod + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 5 + labels: + - dependencies + - go + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + open-pull-requests-limit: 5 + labels: + - dependencies + - ci diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..e146d87 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,63 @@ +name: CI +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: actions/setup-go@v6 + with: + go-version: '1.25' + - uses: golangci/golangci-lint-action@v7 + with: + version: latest + + test: + runs-on: ubuntu-latest + strategy: + 
matrix: + go-version: ['1.23', '1.24', '1.25'] + steps: + - uses: actions/checkout@v5 + - uses: actions/setup-go@v6 + with: + go-version: ${{ matrix.go-version }} + - run: go test -race -v -count=1 -coverprofile=coverage.out ./... + - name: Check coverage + run: | + COVERAGE=$(go tool cover -func=coverage.out | grep total | awk '{print $3}' | sed 's/%//') + echo "Coverage: ${COVERAGE}%" + if [ "$COVERAGE" = "0.0" ] || [ -z "$COVERAGE" ]; then + echo "No unit tests ran — skipping coverage check" + exit 0 + fi + if (( $(echo "$COVERAGE < 80" | bc -l) )); then + echo "Coverage below 80% threshold" + exit 1 + fi + + integration: + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + needs: [lint, test] + steps: + - uses: actions/checkout@v5 + - uses: actions/setup-go@v6 + with: + go-version: '1.25' + - run: go test -race -v -count=1 -tags=integration ./... + env: + API_URL: https://api.firecrawl.dev + TEST_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} diff --git a/.gitignore b/.gitignore index db27dc8..853afeb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,6 @@ .env -vendor \ No newline at end of file +coverage.out +coverage.html +vendor/ +*.test +*.prof diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..21f00a1 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,31 @@ +version: "2" + +run: + timeout: 5m + +formatters: + enable: + - gofumpt + +linters: + enable: + - errcheck + - govet + - staticcheck + - unused + - ineffassign + - misspell + - bodyclose + - noctx + - gosec + - prealloc + + settings: + errcheck: + check-type-assertions: true + govet: + disable: + - fieldalignment + gosec: + excludes: + - G402 # TLS InsecureSkipVerify (user controls this) diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..cb7e73f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,91 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- Search endpoint (`POST /v2/search`) with typed `SearchResponse` (IMP-01) +- Batch Scrape endpoints: `BatchScrapeURLs`, `AsyncBatchScrapeURLs`, `CheckBatchScrapeStatus` (IMP-02) +- Extract endpoints: `Extract`, `AsyncExtract`, `CheckExtractStatus` (IMP-03) +- Typed error system: `APIError` struct with 8 sentinel errors (`ErrUnauthorized`, `ErrRateLimited`, `ErrNoAPIKey`, `ErrPaymentRequired`, `ErrNotFound`, `ErrTimeout`, `ErrConflict`, `ErrServerError`) (IMP-04) +- Security hardening: pagination URL validation against API host, UUID job ID validation, HTTPS warning on non-localhost HTTP (IMP-05) +- Unit test foundation with `httptest.NewServer` mock server helpers (IMP-06) +- 160+ unit tests covering all methods, error paths, and security behaviors (IMP-07, IMP-08) +- HTTP client options: `NewFirecrawlAppWithOptions`, `WithTimeout`, `WithTransport`, `WithUserAgent`, `WithMaxIdleConns`, `WithMaxIdleConnsPerHost` (IMP-15) +- `PaginationConfig` support for `CheckCrawlStatus` and `CheckBatchScrapeStatus` (IMP-10) +- `GetCrawlStatusPage` and `GetBatchScrapeStatusPage` public methods for manual pagination (IMP-10) +- `SDKVersion` constant (`"2.0.0"`) and `User-Agent` header on all requests (IMP-15) +- `CONTRIBUTING.md` with development workflow, code style, and endpoint addition guide (IMP-11) +- Integration tests for Search, Batch Scrape, Extract, and PaginationConfig (IMP-09) + +### Changed + +- **BREAKING:** All public methods now require `context.Context` as first parameter (MIG-05) +- **BREAKING:** `CrawlParams.MaxDepth` renamed to `MaxDiscoveryDepth` (MIG-04) +- **BREAKING:** `CrawlParams.AllowBackwardLinks` renamed to `CrawlEntireDomain` (MIG-04) +- **BREAKING:** `CrawlParams.IgnoreSitemap` replaced by `Sitemap` string enum (`"include"`, `"skip"`, `"only"`) 
(MIG-04) +- **BREAKING:** `CrawlParams.Webhook` changed from `*string` to `*WebhookConfig` (MIG-04) +- **BREAKING:** `MapResponse.Links` changed from `[]string` to `[]MapLink` (MIG-04) +- **BREAKING:** `ScrapeParams.ParsePDF` removed, replaced by `Parsers []ParserConfig` (MIG-04) +- **BREAKING:** `FirecrawlApp.APIKey` field unexported — use `APIKey()` accessor method (IMP-05) +- **BREAKING:** `Search` method signature changed from `(ctx, query, *any) (any, error)` to `(ctx, query, *SearchParams) (*SearchResponse, error)` (IMP-01) +- All endpoints migrated from `/v1/*` to `/v2/*` (MIG-06 through MIG-09) +- `makeRequest` accepts `[]byte` body instead of `map[string]any`; callers marshal before passing (MIG-06) +- `monitorJobStatus` uses v2 status values: `"scraping"` (poll), `"completed"`, `"failed"` (MIG-08) +- Minimum Go version bumped from 1.22 to 1.23 (MIG-04) +- Split monolithic `firecrawl.go` into 16 modular files (MIG-02) +- `http.DefaultTransport` is cloned instead of referenced directly (IMP-15) + +### Fixed + +- Retry counter in `monitorJobStatus` was initialized at retry threshold — now starts at 0 so retries actually occur (MIG-01) +- `defer resp.Body.Close()` inside retry loop leaked HTTP connections; intermediate bodies now closed explicitly (MIG-01) +- Request body (`bytes.NewBuffer`) consumed on first attempt, all retries sent empty body; body now recreated per attempt (MIG-01) +- `ScrapeURL` checked response `Success` before checking unmarshal error — order corrected (MIG-01) +- `ScrapeOptions` gate only checked `Formats` field — gate now checks any non-zero field (MIG-01) + +### Removed + +- Commented-out v0 extractor code (MIG-01) +- Legacy `firecrawl_test.go_V0` test file (MIG-03) +- v1 API paths (`/v1/*`) — all replaced by `/v2/*` + +## [2.0.0] — 2026-03-15 + +### Added + +- `context.Context` on all public methods and internal helpers (MIG-05) +- 31+ v2 type definitions: `LocationConfig`, `WebhookConfig`, `ActionConfig`, `ParserConfig`, 
`MapLink`, `PaginationConfig`, `SearchParams`, `SearchResponse`, `BatchScrapeParams`, `BatchScrapeResponse`, `ExtractParams`, `ExtractResponse`, and more (MIG-04) +- CI/CD pipeline: `Makefile` with 9 targets, `golangci-lint` v2 config, GitHub Actions with lint + test matrix (Go 1.23/1.24/1.25) (MIG-03) +- Modular file structure: 16 Go source files split by concern (MIG-02) +- `.editorconfig` and `dependabot.yml` (MIG-03) + +### Changed + +- All endpoints migrated to `/v2/*` paths (MIG-06 through MIG-09) +- Request bodies use typed struct marshaling instead of `map[string]any` (MIG-11) +- `monitorJobStatus` updated for v2 status values: `"scraping"`, `"completed"`, `"failed"` (MIG-08) +- Crawl parameters updated: `MaxDepth` → `MaxDiscoveryDepth`, `IgnoreSitemap` → `Sitemap`, `AllowBackwardLinks` → `CrawlEntireDomain` (MIG-07) +- `MapResponse.Links` changed from `[]string` to `[]MapLink` (MIG-09) +- `.env.example` updated to use live API URL (MIG-03) + +### Fixed + +- Retry counter starting at threshold instead of 0 (MIG-01) +- `defer resp.Body.Close()` connection leak in retry loop (MIG-01) +- Request body reuse across retries sending empty body (MIG-01) +- Error handling order in `ScrapeURL` — unmarshal error checked before `Success` (MIG-01) +- `ScrapeOptions` gate missing nil check on non-Formats fields (MIG-01) + +### Removed + +- v1 field names: `MaxDepth`, `AllowBackwardLinks`, `IgnoreSitemap` from `CrawlParams` (MIG-07) +- Dead v0 extractor code and legacy test file (MIG-01, MIG-03) + +[Unreleased]: https://github.com/firecrawl/firecrawl-go/compare/v2.0.0...HEAD +[2.0.0]: https://github.com/firecrawl/firecrawl-go/releases/tag/v2.0.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..8a83a55 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,71 @@ +# Contributing to firecrawl-go + +Thank you for your interest in contributing! 
+ +## Quick Start + +```bash +git clone git@github.com:firecrawl/firecrawl-go.git +cd firecrawl-go +go mod download +make check # lint + vet + test +``` + +## Development Workflow + +1. Fork the repository and create a feature branch from `main`. +2. Make your changes following the code style below. +3. Run `make check` before committing (lint + vet + unit tests). +4. Push and open a pull request with a clear description of what changed and why. + +A pre-commit hook runs `make check` automatically on every commit. + +## Code Style + +- Format with `gofumpt`: `make fmt` +- Lint with `golangci-lint` v2: `make lint` +- Vet with `go vet`: `make vet` +- All public methods require `context.Context` as the first parameter. +- Optional request fields use pointer types with `json:",omitempty"`. +- Use typed request structs (internal, unexported) with `json.Marshal` for POST endpoints. +- Follow conventional commit format: `feat(scope): description`, `fix(scope): description`, `docs: description`. + +## Testing + +| Command | What It Runs | API Key? | +|---------|-------------|----------| +| `make test` | 160 unit tests (httptest mocks) | No | +| `make test-integration` | 32 E2E tests (live Firecrawl API) | Yes | +| `make coverage` | HTML coverage report | No | + +Unit tests run against `httptest.NewServer` mock servers — no `.env` file or API key needed. If unit tests fail, the issue is in the code, not missing credentials. + +For integration tests: + +```bash +cp .env.example .env +# Edit .env: +# API_URL=https://api.firecrawl.dev +# TEST_API_KEY=fc-your-api-key +make test-integration +``` + +Integration tests consume API credits. + +## Prerequisites + +| Tool | Version | Installation | +|------|---------|-------------| +| Go | 1.23+ | [go.dev/dl](https://go.dev/dl/) | +| golangci-lint | v2.x | `go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest` | +| gofumpt | latest | `go install mvdan.cc/gofumpt@latest` | + +## Adding a New Endpoint + +1. 
Define request/response types in `types.go` with full godoc comments. +2. Create a new file `.go` with the public method(s). +3. Add a corresponding `_test.go` with unit tests using `httptest.NewServer`. +4. If the endpoint is async with polling, add E2E tests in `firecrawl_test.go` (build tag: `integration`). +5. Run `make check` to verify everything passes. + +Every exported symbol must have a godoc comment. Public methods must document all parameters, return values, and any error conditions. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b1a4097 --- /dev/null +++ b/Makefile @@ -0,0 +1,34 @@ +.DEFAULT_GOAL := help +.PHONY: help build test test-integration lint fmt vet coverage clean check + +help: ## Show this help + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \ + awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' + +build: ## Compile the library + go build ./... + +test: ## Run unit tests (no API key needed) + go test -race -v -count=1 ./... + +test-integration: ## Run integration tests (requires .env with API key) + go test -race -v -count=1 -tags=integration ./... + +lint: ## Run golangci-lint + golangci-lint run + +fmt: ## Format code with gofumpt + gofumpt -w . + +vet: ## Run go vet + go vet ./... + +coverage: ## Generate HTML coverage report + go test -coverprofile=coverage.out -covermode=atomic ./... + go tool cover -html=coverage.out -o coverage.html + @echo "Coverage report: coverage.html" + +clean: ## Remove generated files + rm -f coverage.out coverage.html + +check: lint vet test ## Run all checks (lint + vet + test) diff --git a/README.md b/README.md index 086db7f..f54f7fd 100644 --- a/README.md +++ b/README.md @@ -1,186 +1,499 @@ -# Firecrawl Go SDK +# firecrawl-go v2 -The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). 
It provides a simple and intuitive interface for interacting with the Firecrawl API. +Go SDK for the [Firecrawl](https://firecrawl.dev) v2 API. Scrape, crawl, map, search, batch-scrape, and extract structured data from websites — with output formatted for LLMs. -## Installation +> **Fork of [firecrawl/firecrawl-go](https://github.com/ArmandoHerra/firecrawl-go)** — migrated to Firecrawl API v2 with typed request structs, `context.Context` on every method, typed errors, security hardening, functional client options, and a modern CI pipeline. -To install the Firecrawl Go SDK, you can +## Installation ```bash -go get github.com/mendableai/firecrawl-go/v2 +go get github.com/firecrawl/firecrawl-go/v2 ``` -## Usage - -1. Get an API key from [firecrawl.dev](https://firecrawl.dev) -2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class. +Requires Go 1.23+. - -Here's an example of how to use the SDK with error handling: +## Quick Start ```go package main import ( - "encoding/json" + "context" "fmt" "log" - "github.com/mendableai/firecrawl-go/v2" + firecrawl "github.com/firecrawl/firecrawl-go/v2" ) func main() { - // Initialize the FirecrawlApp with your API key and optional URL - app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY", "YOUR_API_URL") + app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY", "") if err != nil { - log.Fatalf("Failed to initialize FirecrawlApp: %v", err) + log.Fatal(err) } - // Scrape a single URL - scrapeResult, err := app.ScrapeURL("example.com", nil) - if err != nil { - log.Fatalf("Failed to scrape URL: %v", err) - } - fmt.Println(scrapeResult.Markdown) - - // Crawl a website - idempotencyKey := "idempotency-key" // optional idempotency key - crawlParams := &firecrawl.CrawlParams{ - ExcludePaths: []string{"blog/*"}, - MaxDepth: prt(2), - } - crawlResult, err := app.CrawlURL("example.com", crawlParams, &idempotencyKey) - if err != nil { - log.Fatalf("Failed to crawl URL: %v", 
err) - } - jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ") + doc, err := app.ScrapeURL(context.Background(), "https://example.com", nil) if err != nil { - log.Fatalf("Failed to marshal crawl result: %v", err) + log.Fatal(err) } - fmt.Println(string(jsonCrawlResult)) + fmt.Println(doc.Markdown) } ``` -### Scraping a URL +## API Methods + +All methods accept `context.Context` as the first parameter for cancellation and deadlines. + +### Scrape + +| Method | Endpoint | Description | +|--------|----------|-------------| +| `ScrapeURL(ctx, url, params)` | `POST /v2/scrape` | Scrape a single URL, returns markdown/HTML/JSON/screenshot | + +### Crawl + +| Method | Endpoint | Description | +|--------|----------|-------------| +| `CrawlURL(ctx, url, params, key, pollInterval...)` | `POST /v2/crawl` | Start a crawl and poll until complete | +| `AsyncCrawlURL(ctx, url, params, key)` | `POST /v2/crawl` | Start an async crawl, returns job ID | +| `CheckCrawlStatus(ctx, id, pagination...)` | `GET /v2/crawl/{id}` | Check status; optional auto-pagination | +| `GetCrawlStatusPage(ctx, nextURL)` | `GET /v2/crawl/{id}?cursor=...` | Fetch one page manually (for manual pagination) | +| `CancelCrawlJob(ctx, id)` | `DELETE /v2/crawl/{id}` | Cancel a running crawl | + +### Map + +| Method | Endpoint | Description | +|--------|----------|-------------| +| `MapURL(ctx, url, params)` | `POST /v2/map` | Discover all URLs on a site, returns `[]MapLink` | + +### Search -To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data as a dictionary. 
+| Method | Endpoint | Description | +|--------|----------|-------------| +| `Search(ctx, query, params)` | `POST /v2/search` | Web/image/news search with optional content scraping | + +### Batch Scrape + +| Method | Endpoint | Description | +|--------|----------|-------------| +| `BatchScrapeURLs(ctx, urls, params, key, pollInterval...)` | `POST /v2/batch/scrape` | Scrape multiple URLs, poll until complete | +| `AsyncBatchScrapeURLs(ctx, urls, params, key)` | `POST /v2/batch/scrape` | Start batch scrape async, returns job ID | +| `CheckBatchScrapeStatus(ctx, id, pagination...)` | `GET /v2/batch/scrape/{id}` | Check status; optional auto-pagination | +| `GetBatchScrapeStatusPage(ctx, nextURL)` | `GET /v2/batch/scrape/{id}?cursor=...` | Fetch one page manually | + +### Extract + +| Method | Endpoint | Description | +|--------|----------|-------------| +| `Extract(ctx, urls, params)` | `POST /v2/extract` | LLM-based structured extraction, poll until complete | +| `AsyncExtract(ctx, urls, params)` | `POST /v2/extract` | Start extraction async, returns job ID | +| `CheckExtractStatus(ctx, id)` | `GET /v2/extract/{id}` | Check extraction job status | + +## Usage Examples + +### Scrape with Options ```go -url := "https://example.com" -scrapedData, err := app.ScrapeURL(url, nil) +func ptr[T any](v T) *T { return &v } + +doc, err := app.ScrapeURL(ctx, "https://example.com", &firecrawl.ScrapeParams{ + Formats: []string{"markdown", "html"}, + OnlyMainContent: ptr(true), + Mobile: ptr(true), + BlockAds: ptr(true), + Location: &firecrawl.LocationConfig{Country: "US", Languages: []string{"en"}}, +}) if err != nil { - log.Fatalf("Failed to scrape URL: %v", err) + log.Fatal(err) } -fmt.Println(scrapedData) +fmt.Println(doc.Markdown) ``` -### Extracting structured data from a URL - -With LLM extraction, you can easily extract structured data from any URL. 
Here is how you to use it: +### Crawl a Website (Synchronous) ```go -jsonSchema := map[string]any{ - "type": "object", - "properties": map[string]any{ - "top": map[string]any{ - "type": "array", - "items": map[string]any{ - "type": "object", - "properties": map[string]any{ - "title": map[string]string{"type": "string"}, - "points": map[string]string{"type": "number"}, - "by": map[string]string{"type": "string"}, - "commentsURL": map[string]string{"type": "string"}, - }, - "required": []string{"title", "points", "by", "commentsURL"}, - }, - "minItems": 5, - "maxItems": 5, - "description": "Top 5 stories on Hacker News", - }, - }, - "required": []string{"top"}, +result, err := app.CrawlURL(ctx, "https://example.com", &firecrawl.CrawlParams{ + Limit: ptr(100), + MaxDiscoveryDepth: ptr(3), + CrawlEntireDomain: ptr(true), + Sitemap: ptr("include"), + ExcludePaths: []string{"blog/*"}, +}, nil) // nil idempotency key +if err != nil { + log.Fatal(err) } +fmt.Printf("Scraped %d pages\n", len(result.Data)) +``` -llmExtractionParams := map[string]any{ - "extractorOptions": firecrawl.ExtractorOptions{ - ExtractionSchema: jsonSchema, - }, +### Async Crawl with Context Timeout + +```go +ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) +defer cancel() + +crawlResp, err := app.AsyncCrawlURL(ctx, "https://example.com", nil, nil) +if err != nil { + log.Fatal(err) } -scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams) +// Check status (single page) +status, err := app.CheckCrawlStatus(ctx, crawlResp.ID) if err != nil { - log.Fatalf("Failed to perform LLM extraction: %v", err) + log.Fatal(err) } -fmt.Println(scrapeResult) +fmt.Printf("Status: %s, Pages: %d/%d\n", status.Status, status.Completed, status.Total) ``` -### Crawling a Website +### Search -To crawl a website, use the `CrawlUrl` method. It takes the starting URL and optional parameters as arguments. 
The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. +```go +results, err := app.Search(ctx, "go generics tutorial", &firecrawl.SearchParams{ + Limit: ptr(5), + Country: ptr("US"), + Sources: []string{"web", "news"}, +}) +if err != nil { + log.Fatal(err) +} +for _, r := range results.Data.Web { + fmt.Printf("%s — %s\n", r.Title, r.URL) +} +``` + +### Batch Scrape (Synchronous) ```go -response, err := app.CrawlURL("https://roastmywebsite.ai", nil,nil) +urls := []string{ + "https://example.com", + "https://example.org", + "https://example.net", +} +result, err := app.BatchScrapeURLs(ctx, urls, &firecrawl.BatchScrapeParams{ + ScrapeOptions: firecrawl.ScrapeParams{ + Formats: []string{"markdown"}, + OnlyMainContent: ptr(true), + }, + MaxConcurrency: ptr(5), +}, nil) // nil idempotency key if err != nil { - log.Fatalf("Failed to crawl URL: %v", err) + log.Fatal(err) } - -fmt.Println(response) +fmt.Printf("Scraped %d URLs\n", len(result.Data)) ``` -### Asynchronous Crawl - -To initiate an asynchronous crawl of a website, utilize the `AsyncCrawlURL` method. This method requires the starting URL and optional parameters as inputs. The `params` argument enables you to define various settings for the asynchronous crawl, such as the maximum number of pages to crawl, permitted domains, and the output format. Upon successful initiation, this method returns an ID, which is essential for subsequently checking the status of the crawl. 
+### Async Batch Scrape with Manual Pagination ```go -response, err := app.AsyncCrawlURL("https://roastmywebsite.ai", nil, nil) +batchResp, err := app.AsyncBatchScrapeURLs(ctx, urls, nil, nil) +if err != nil { + log.Fatal(err) +} +// Check status — get first page +status, err := app.CheckBatchScrapeStatus(ctx, batchResp.ID) if err != nil { - log.Fatalf("Failed to crawl URL: %v", err) + log.Fatal(err) } -fmt.Println(response) +// Manually iterate pages +for status.Next != nil { + status, err = app.GetBatchScrapeStatusPage(ctx, *status.Next) + if err != nil { + log.Fatal(err) + } + fmt.Printf("Page data: %d results\n", len(status.Data)) +} ``` +### Extract Structured Data + +```go +schema := map[string]any{ + "type": "object", + "properties": map[string]any{ + "company_name": map[string]any{"type": "string"}, + "founded": map[string]any{"type": "integer"}, + "employees": map[string]any{"type": "integer"}, + }, +} -### Checking Crawl Status +result, err := app.Extract(ctx, []string{"https://example.com/about"}, &firecrawl.ExtractParams{ + Prompt: ptr("Extract company information including name, founding year, and employee count."), + Schema: schema, +}) +if err != nil { + log.Fatal(err) +} +fmt.Printf("Extracted: %v\n", result.Data) +``` -To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the crawl ID as a parameter and returns the current status of the crawl job. +### Map a Website ```go -status, err := app.CheckCrawlStatus(id) +mapResp, err := app.MapURL(ctx, "https://example.com", &firecrawl.MapParams{ + Limit: ptr(5000), + Sitemap: ptr("include"), +}) if err != nil { - log.Fatalf("Failed to check crawl status: %v", err) + log.Fatal(err) +} +for _, link := range mapResp.Links { + fmt.Printf("%s\n", link.URL) } -fmt.Println(status) ``` -### Canceling a Crawl Job -To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job. 
+## Pagination + +For large crawls and batch scrapes, the API returns paginated results with a `Next` URL. + +### Auto-Pagination (Recommended) + +Pass a `PaginationConfig` to `CheckCrawlStatus` or `CheckBatchScrapeStatus` to automatically collect all pages: ```go -canceled, err := app.CancelCrawlJob(jobId) -if err != nil { - log.Fatalf("Failed to cancel crawl job: %v", err) +result, err := app.CheckCrawlStatus(ctx, crawlID, &firecrawl.PaginationConfig{ + AutoPaginate: ptr(true), + MaxPages: ptr(10), // stop after 10 pages + MaxResults: ptr(1000), // stop after 1000 total results + MaxWaitTime: ptr(60), // stop after 60 seconds +}) +``` + +### Manual Pagination + +Use `GetCrawlStatusPage` / `GetBatchScrapeStatusPage` to fetch one page at a time: + +```go +status, err := app.CheckCrawlStatus(ctx, crawlID) +for status.Next != nil { + status, err = app.GetCrawlStatusPage(ctx, *status.Next) + if err != nil { + break + } + // process status.Data for this page } -fmt.Println(canceled) ``` ## Error Handling -The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message. +The SDK uses typed errors enabling `errors.Is` and `errors.As` for programmatic handling. -## Contributing +### Sentinel Errors -Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository. 
+| Sentinel | HTTP Status | Meaning | +|----------|-------------|---------| +| `ErrNoAPIKey` | — | No API key provided to constructor | +| `ErrUnauthorized` | 401 | Invalid or expired API key | +| `ErrPaymentRequired` | 402 | Account credit limit reached | +| `ErrNotFound` | 404 | Resource not found | +| `ErrTimeout` | 408 | Request timed out | +| `ErrConflict` | 409 | Conflicting operation (e.g., duplicate idempotency key) | +| `ErrRateLimited` | 429 | Rate limit exceeded | +| `ErrServerError` | 500 | Internal server error | -## License +### errors.Is — Check Error Type + +```go +_, err := app.ScrapeURL(ctx, url, nil) +if errors.Is(err, firecrawl.ErrRateLimited) { + time.Sleep(5 * time.Second) + // retry... +} +if errors.Is(err, firecrawl.ErrUnauthorized) { + log.Fatal("Check your API key") +} +``` + +### errors.As — Access Full Error Details + +```go +var apiErr *firecrawl.APIError +if errors.As(err, &apiErr) { + log.Printf("HTTP %d during %s: %s", apiErr.StatusCode, apiErr.Action, apiErr.Message) +} +``` + +## Configuration + +### Default Constructor + +```go +app, err := firecrawl.NewFirecrawlApp("fc-your-api-key", "") +// API URL defaults to https://api.firecrawl.dev +// Timeout defaults to 120 seconds +``` + +Falls back to environment variables if arguments are empty: +- `FIRECRAWL_API_KEY` — API key +- `FIRECRAWL_API_URL` — API base URL + +### Functional Options Constructor + +```go +app, err := firecrawl.NewFirecrawlAppWithOptions( + "fc-your-api-key", + "", + firecrawl.WithTimeout(30*time.Second), + firecrawl.WithUserAgent("my-app/1.0"), +) +``` + +Available options: + +| Option | Default | Description | +|--------|---------|-------------| +| `WithTimeout(d)` | 120s | HTTP client timeout | +| `WithTransport(t)` | `http.DefaultTransport` clone | Custom HTTP transport | +| `WithUserAgent(ua)` | `firecrawl-go/2.0.0` | User-Agent header | +| `WithMaxIdleConns(n)` | 100 | Max idle keep-alive connections | +| `WithMaxIdleConnsPerHost(n)` | 10 | Max idle 
connections per host | + +### API Key Access + +The `apiKey` field is unexported. Use the `APIKey()` accessor method: + +```go +fmt.Println(app.APIKey()) // "fc-abc...xyz" +fmt.Println(app.String()) // "FirecrawlApp{url: ..., key: fc-a...xyz}" (redacted) +``` + +## Security + +- **API key unexported** — the `apiKey` field is unexported; use `APIKey()` to read it. `String()` returns a redacted representation. +- **URL validation** — all job IDs are validated as UUIDs before being interpolated into request paths, preventing path injection attacks. +- **Pagination SSRF prevention** — `Next` URLs from the API are validated to share the same host as the configured `APIURL` before any request is made. +- **HTTP warning** — a log warning is emitted when a non-localhost HTTP (non-TLS) URL is used, because the API key would be sent in cleartext. -The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions: +## Tech Stack -- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +| Technology | Version | Purpose | +|-----------|---------|---------| +| Go | 1.23+ | Language runtime | +| golangci-lint | v2.x | Linting (errcheck, govet, staticcheck, gosec, etc.) | +| gofumpt | latest | Code formatting | +| GitHub Actions | — | CI: lint + test matrix (Go 1.23/1.24/1.25) | +| testify | v1.10 | Test assertions (integration tests) | -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +## Project Structure + +``` +firecrawl-go/ +├── client.go # FirecrawlApp struct, NewFirecrawlApp, NewFirecrawlAppWithOptions +├── client_options.go # ClientOption type, WithTimeout, WithTransport, WithUserAgent, etc. +├── types.go # All request/response type definitions (35+ v2 types) +├── scrape.go # ScrapeURL — POST /v2/scrape +├── crawl.go # CrawlURL, AsyncCrawlURL, CheckCrawlStatus, GetCrawlStatusPage, CancelCrawlJob +├── map.go # MapURL — POST /v2/map +├── search.go # Search — POST /v2/search +├── batch.go # BatchScrapeURLs, AsyncBatchScrapeURLs, CheckBatchScrapeStatus, GetBatchScrapeStatusPage +├── extract.go # Extract, AsyncExtract, CheckExtractStatus +├── errors.go # APIError, sentinel errors (ErrUnauthorized, ErrRateLimited, etc.) 
+├── security.go # validateJobID, validatePaginationURL +├── helpers.go # makeRequest, monitorJobStatus — internal HTTP + polling +├── options.go # requestOptions, withRetries, withBackoff — internal retry config +├── firecrawl.go # Package doc comment +├── client_test.go # Unit tests: constructor, options, security +├── scrape_test.go # Unit tests: ScrapeURL +├── crawl_test.go # Unit tests: CrawlURL, CheckCrawlStatus, pagination +├── map_test.go # Unit tests: MapURL +├── search_test.go # Unit tests: Search +├── batch_test.go # Unit tests: BatchScrapeURLs, CheckBatchScrapeStatus, pagination +├── extract_test.go # Unit tests: Extract, CheckExtractStatus +├── errors_test.go # Unit tests: APIError, sentinel errors, Unwrap +├── helpers_test.go # Unit tests: makeRequest, retry logic +├── security_test.go # Unit tests: validateJobID, validatePaginationURL +├── types_test.go # Unit tests: StringOrStringSlice JSON unmarshaling +├── testhelpers_test.go # Shared test helpers: ptr[T](), test server setup +├── firecrawl_test.go # Integration/E2E tests (//go:build integration, 32 tests) +├── Makefile # build, test, test-integration, lint, fmt, vet, coverage, check +├── .golangci.yml # golangci-lint v2 configuration +├── .github/ +│ ├── workflows/ci.yml # CI pipeline (lint + test matrix + integration) +│ └── dependabot.yml # Automated dependency updates +├── .editorconfig # Editor settings +├── .env.example # Environment template for integration tests +├── go.mod / go.sum # Module: github.com/firecrawl/firecrawl-go/v2 +├── CHANGELOG.md # Keep a Changelog format — all migration and improvement changes +└── LICENSE # MIT +``` + +## Available Commands + +| Command | Description | +|---------|-------------| +| `make help` | Show all available targets | +| `make build` | Compile the library | +| `make test` | Run unit tests (no API key needed, 160 tests) | +| `make test-integration` | Run integration/E2E tests (requires `.env`) | +| `make lint` | Run golangci-lint | +| `make fmt` | 
Format code with gofumpt | +| `make vet` | Run go vet | +| `make coverage` | Generate HTML coverage report | +| `make clean` | Remove generated files | +| `make check` | Run lint + vet + test (full pre-commit check) | + +## Testing + +### Unit Tests (no API key needed) + +```bash +make test +# or: go test -race -v -count=1 ./... +``` + +160 unit tests run using `httptest.NewServer` mock servers. No `.env` or API key required. + +### Integration Tests (live API) + +```bash +cp .env.example .env +# Edit .env: +# API_URL=https://api.firecrawl.dev +# TEST_API_KEY=fc-your-api-key +make test-integration +# or: go test -race -v -count=1 -tags=integration ./... +``` + +32 E2E tests hit the live Firecrawl v2 API. These consume API credits. + +### Environment Variables + +| Variable | Used By | Required For | +|----------|---------|-------------| +| `FIRECRAWL_API_KEY` | SDK runtime | Production use (constructor fallback) | +| `FIRECRAWL_API_URL` | SDK runtime | Custom API URL (defaults to `https://api.firecrawl.dev`) | +| `TEST_API_KEY` | Integration tests | `make test-integration` | +| `API_URL` | Integration tests | `make test-integration` | + +## Development + +### Prerequisites + +- Go 1.23+ +- golangci-lint v2 (`go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@latest`) +- gofumpt (`go install mvdan.cc/gofumpt@latest`) + +### Setup + +```bash +git clone git@github.com:firecrawl/firecrawl-go.git +cd firecrawl-go +go mod download +make check # lint + vet + test +``` + +### Development Loop + +```bash +make fmt # Format with gofumpt +make check # Lint + vet + all unit tests +# Commit — pre-commit hook runs make check automatically +``` + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) for setup, code style, and pull request guidelines. + +## License -Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. 
Always refer to the license information in the root directory of the main project for overall licensing details. +MIT License. See [LICENSE](LICENSE) for details. diff --git a/batch.go b/batch.go new file mode 100644 index 0000000..7174c3d --- /dev/null +++ b/batch.go @@ -0,0 +1,361 @@ +package firecrawl + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" +) + +// batchScrapeRequest is the internal request struct for batch scrape operations. +// It is unexported — callers use BatchScrapeParams instead. +type batchScrapeRequest struct { + URLs []string `json:"urls"` + ScrapeOptions *ScrapeParams `json:"scrapeOptions,omitempty"` + MaxConcurrency *int `json:"maxConcurrency,omitempty"` + IgnoreInvalidURLs *bool `json:"ignoreInvalidURLs,omitempty"` + Webhook *WebhookConfig `json:"webhook,omitempty"` +} + +// AsyncBatchScrapeURLs starts a batch scrape job asynchronously. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - urls: The list of URLs to scrape. +// - params: Optional parameters for the batch scrape request. +// - idempotencyKey: An optional idempotency key (can be nil). +// +// Returns: +// - *BatchScrapeResponse: The response with job ID for polling. +// - error: An error if starting the batch scrape fails. +func (app *FirecrawlApp) AsyncBatchScrapeURLs(ctx context.Context, urls []string, params *BatchScrapeParams, idempotencyKey *string) (*BatchScrapeResponse, error) { + headers := app.prepareHeaders(idempotencyKey) + + req := batchScrapeRequest{URLs: urls} + if params != nil { + // Only include ScrapeOptions if at least one field is set. 
+ scrapeOpts := params.ScrapeOptions + if scrapeOpts.Formats != nil || scrapeOpts.Headers != nil || scrapeOpts.IncludeTags != nil || + scrapeOpts.ExcludeTags != nil || scrapeOpts.OnlyMainContent != nil || scrapeOpts.WaitFor != nil || + scrapeOpts.Timeout != nil || scrapeOpts.MaxAge != nil || scrapeOpts.MinAge != nil || + scrapeOpts.JsonOptions != nil || scrapeOpts.Mobile != nil || scrapeOpts.SkipTlsVerification != nil || + scrapeOpts.BlockAds != nil || scrapeOpts.Proxy != nil || scrapeOpts.Location != nil || + scrapeOpts.Parsers != nil || scrapeOpts.Actions != nil || scrapeOpts.RemoveBase64Images != nil || + scrapeOpts.StoreInCache != nil || scrapeOpts.ZeroDataRetention != nil { + req.ScrapeOptions = &scrapeOpts + } + req.MaxConcurrency = params.MaxConcurrency + req.IgnoreInvalidURLs = params.IgnoreInvalidURLs + req.Webhook = params.Webhook + } + + body, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal batch scrape request: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodPost, + fmt.Sprintf("%s/v2/batch/scrape", app.APIURL), + body, + headers, + "start batch scrape job", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var batchResponse BatchScrapeResponse + if err := json.Unmarshal(resp, &batchResponse); err != nil { + return nil, fmt.Errorf("failed to parse batch scrape response: %w", err) + } + + if batchResponse.ID == "" { + return nil, fmt.Errorf("failed to get batch scrape job ID") + } + + return &batchResponse, nil +} + +// BatchScrapeURLs starts a batch scrape job and polls until completion. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - urls: The list of URLs to scrape. +// - params: Optional parameters for the batch scrape request. +// - idempotencyKey: An optional idempotency key (can be nil). +// - pollInterval: An optional interval (in seconds) at which to poll. Default is 2 seconds. 
+// +// Returns: +// - *BatchScrapeStatusResponse: The batch scrape result with all scraped documents. +// - error: An error if the batch scrape fails. +func (app *FirecrawlApp) BatchScrapeURLs(ctx context.Context, urls []string, params *BatchScrapeParams, idempotencyKey *string, pollInterval ...int) (*BatchScrapeStatusResponse, error) { + response, err := app.AsyncBatchScrapeURLs(ctx, urls, params, idempotencyKey) + if err != nil { + return nil, err + } + + actualPollInterval := 2 + if len(pollInterval) > 0 { + actualPollInterval = pollInterval[0] + } + + headers := app.prepareHeaders(nil) + return app.monitorBatchScrapeStatus(ctx, response.ID, headers, actualPollInterval) +} + +// CheckBatchScrapeStatus checks the status of a batch scrape job. +// +// When a PaginationConfig is provided with AutoPaginate enabled, it automatically +// follows Next URLs to collect all results, respecting MaxPages, MaxResults, and +// MaxWaitTime limits. Without PaginationConfig, only the first page is returned. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - id: The ID of the batch scrape job to check. +// - pagination: An optional PaginationConfig to control auto-pagination behavior. +// +// Returns: +// - *BatchScrapeStatusResponse: The current status of the batch scrape job (possibly spanning multiple pages). +// - error: An error if the status check fails. 
+func (app *FirecrawlApp) CheckBatchScrapeStatus(ctx context.Context, id string, pagination ...*PaginationConfig) (*BatchScrapeStatusResponse, error) { + if err := validateJobID(id); err != nil { + return nil, err + } + + headers := app.prepareHeaders(nil) + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + fmt.Sprintf("%s/v2/batch/scrape/%s", app.APIURL, id), + nil, + headers, + "check batch scrape status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var statusResponse BatchScrapeStatusResponse + if err := json.Unmarshal(resp, &statusResponse); err != nil { + return nil, fmt.Errorf("failed to parse batch scrape status response: %w", err) + } + + // Without PaginationConfig or AutoPaginate disabled, return the single page. + if len(pagination) == 0 || pagination[0] == nil || pagination[0].AutoPaginate == nil || !*pagination[0].AutoPaginate { + return &statusResponse, nil + } + + return app.autoPaginateBatchScrapeStatus(ctx, &statusResponse, headers, pagination[0]) +} + +// autoPaginateBatchScrapeStatus follows Next URLs collecting all data, respecting +// MaxPages, MaxResults, and MaxWaitTime limits from the provided PaginationConfig. +func (app *FirecrawlApp) autoPaginateBatchScrapeStatus(ctx context.Context, initial *BatchScrapeStatusResponse, headers map[string]string, cfg *PaginationConfig) (*BatchScrapeStatusResponse, error) { + allData := initial.Data + current := initial + pagesCollected := 1 + startTime := time.Now() + + maxPages := 0 + if cfg.MaxPages != nil { + maxPages = *cfg.MaxPages + } + maxResults := 0 + if cfg.MaxResults != nil { + maxResults = *cfg.MaxResults + } + maxWaitSeconds := 0 + if cfg.MaxWaitTime != nil { + maxWaitSeconds = *cfg.MaxWaitTime + } + + for current.Next != nil { + // Check page limit. + if maxPages > 0 && pagesCollected >= maxPages { + break + } + // Check result limit. 
+ if maxResults > 0 && len(allData) >= maxResults { + allData = allData[:maxResults] + break + } + // Check time limit. + if maxWaitSeconds > 0 && int(time.Since(startTime).Seconds()) >= maxWaitSeconds { + break + } + + if ctx.Err() != nil { + return nil, ctx.Err() + } + + if err := validatePaginationURL(app.APIURL, *current.Next); err != nil { + return nil, fmt.Errorf("unsafe pagination URL: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + *current.Next, + nil, + headers, + "fetch next page of batch scrape status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var pageData BatchScrapeStatusResponse + if err := json.Unmarshal(resp, &pageData); err != nil { + return nil, fmt.Errorf("failed to parse batch scrape status page: %w", err) + } + + if pageData.Data != nil { + allData = append(allData, pageData.Data...) + } + current = &pageData + pagesCollected++ + } + + current.Data = allData + return current, nil +} + +// GetBatchScrapeStatusPage fetches a specific page of batch scrape status results by URL. +// Use this for manual pagination — pass the Next URL from a previous BatchScrapeStatusResponse. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - nextURL: The full URL of the next results page (from BatchScrapeStatusResponse.Next). +// +// Returns: +// - *BatchScrapeStatusResponse: The results for this page. +// - error: An error if the request fails or the URL is not trusted. 
+func (app *FirecrawlApp) GetBatchScrapeStatusPage(ctx context.Context, nextURL string) (*BatchScrapeStatusResponse, error) { + if err := validatePaginationURL(app.APIURL, nextURL); err != nil { + return nil, fmt.Errorf("unsafe pagination URL: %w", err) + } + + headers := app.prepareHeaders(nil) + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + nextURL, + nil, + headers, + "fetch batch scrape status page", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var statusResponse BatchScrapeStatusResponse + if err := json.Unmarshal(resp, &statusResponse); err != nil { + return nil, fmt.Errorf("failed to parse batch scrape status page: %w", err) + } + + return &statusResponse, nil +} + +// monitorBatchScrapeStatus polls a batch scrape job until completion. +// Mirrors monitorJobStatus from helpers.go but returns BatchScrapeStatusResponse. +func (app *FirecrawlApp) monitorBatchScrapeStatus(ctx context.Context, id string, headers map[string]string, pollInterval int) (*BatchScrapeStatusResponse, error) { + attempts := 0 + + for { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + fmt.Sprintf("%s/v2/batch/scrape/%s", app.APIURL, id), + nil, + headers, + "check batch scrape status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var statusData BatchScrapeStatusResponse + if err := json.Unmarshal(resp, &statusData); err != nil { + return nil, err + } + + status := statusData.Status + if status == "" { + return nil, fmt.Errorf("invalid status in batch scrape response") + } + + switch status { + case "completed": + if statusData.Data != nil { + allData := statusData.Data + for statusData.Next != nil { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + if err := validatePaginationURL(app.APIURL, *statusData.Next); err != nil { + return nil, fmt.Errorf("unsafe pagination URL: %w", err) + } + + resp, err := app.makeRequest( + ctx, + 
http.MethodGet, + *statusData.Next, + nil, + headers, + "fetch next page of batch scrape status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + if err := json.Unmarshal(resp, &statusData); err != nil { + return nil, err + } + + if statusData.Data != nil { + allData = append(allData, statusData.Data...) + } + } + statusData.Data = allData + return &statusData, nil + } + attempts++ + if attempts > 3 { + return nil, fmt.Errorf("batch scrape job completed but no data was returned") + } + case "scraping": + interval := max(pollInterval, 2) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(time.Duration(interval) * time.Second): + } + case "failed": + return nil, fmt.Errorf("batch scrape job failed. Status: %s", status) + default: + return nil, fmt.Errorf("unknown batch scrape status: %s", status) + } + } +} diff --git a/batch_test.go b/batch_test.go new file mode 100644 index 0000000..0255ce8 --- /dev/null +++ b/batch_test.go @@ -0,0 +1,491 @@ +package firecrawl + +import ( + "context" + "net/http" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// validBatchID is a valid UUID used across batch scrape tests. 
+const validBatchID = "660e8400-e29b-41d4-a716-446655440001" + +// ---- AsyncBatchScrapeURLs ---- + +func TestAsyncBatchScrapeURLs_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "/v2/batch/scrape", r.URL.Path) + + var body map[string]any + decodeJSONBody(t, r, &body) + urls, ok := body["urls"].([]any) + require.True(t, ok) + assert.Equal(t, "https://example.com", urls[0]) + + respondJSON(w, http.StatusOK, BatchScrapeResponse{ + Success: true, + ID: validBatchID, + }) + }) + + result, err := app.AsyncBatchScrapeURLs(context.Background(), []string{"https://example.com"}, nil, nil) + require.NoError(t, err) + assert.Equal(t, validBatchID, result.ID) + assert.True(t, result.Success) +} + +func TestAsyncBatchScrapeURLs_WithParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + + urls, ok := body["urls"].([]any) + require.True(t, ok) + assert.Len(t, urls, 2) + + assert.NotNil(t, body["maxConcurrency"]) + assert.NotNil(t, body["ignoreInvalidURLs"]) + assert.NotNil(t, body["webhook"]) + assert.NotNil(t, body["scrapeOptions"]) + + respondJSON(w, http.StatusOK, BatchScrapeResponse{ + Success: true, + ID: validBatchID, + }) + }) + + params := &BatchScrapeParams{ + ScrapeOptions: ScrapeParams{ + Formats: []string{"markdown"}, + OnlyMainContent: ptr(true), + }, + MaxConcurrency: ptr(5), + IgnoreInvalidURLs: ptr(true), + Webhook: &WebhookConfig{ + URL: "https://webhook.example.com/callback", + Events: []string{"completed", "failed"}, + }, + } + + result, err := app.AsyncBatchScrapeURLs( + context.Background(), + []string{"https://example.com", "https://example.org"}, + params, + nil, + ) + require.NoError(t, err) + assert.Equal(t, validBatchID, result.ID) +} + +func TestAsyncBatchScrapeURLs_WithIdempotencyKey(t *testing.T) { + app, _ := newMockServer(t, func(w 
http.ResponseWriter, r *http.Request) { + assert.Equal(t, "test-idem-key", r.Header.Get("x-idempotency-key")) + respondJSON(w, http.StatusOK, BatchScrapeResponse{ + Success: true, + ID: validBatchID, + }) + }) + + result, err := app.AsyncBatchScrapeURLs(context.Background(), []string{"https://example.com"}, nil, ptr("test-idem-key")) + require.NoError(t, err) + assert.Equal(t, validBatchID, result.ID) +} + +func TestAsyncBatchScrapeURLs_MissingID(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, BatchScrapeResponse{ + Success: true, + ID: "", // Missing ID + }) + }) + + _, err := app.AsyncBatchScrapeURLs(context.Background(), []string{"https://example.com"}, nil, nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "job ID") +} + +func TestAsyncBatchScrapeURLs_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"}) + }) + + _, err := app.AsyncBatchScrapeURLs(context.Background(), []string{"https://example.com"}, nil, nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +// ---- BatchScrapeURLs ---- + +func TestBatchScrapeURLs_PollsUntilComplete(t *testing.T) { + var requestCount atomic.Int32 + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + count := requestCount.Add(1) + if r.Method == http.MethodPost && r.URL.Path == "/v2/batch/scrape" { + respondJSON(w, http.StatusOK, BatchScrapeResponse{ + Success: true, + ID: validBatchID, + }) + return + } + // First GET returns "scraping", subsequent returns "completed". 
+ if count == 2 { + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "scraping", + Total: 2, + Completed: 0, + }) + return + } + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}, {Markdown: "# Page 2"}}, + }) + }) + + // Use pollInterval of 0 to skip the 2-second minimum enforcement + // (the select fires immediately when interval is 0 — this is tested via timeout context). + // To avoid the min(pollInterval,2) clamp blocking the test, we pass a cancelled-friendly path: + // the mock returns completed on the third request so no actual sleep occurs. + result, err := app.BatchScrapeURLs(context.Background(), []string{"https://a.com", "https://b.com"}, nil, nil, 0) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 2) +} + +func TestBatchScrapeURLs_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately before any request + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + _, err := app.BatchScrapeURLs(ctx, []string{"https://example.com"}, nil, nil) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + +func TestBatchScrapeURLs_Failed(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + respondJSON(w, http.StatusOK, BatchScrapeResponse{Success: true, ID: validBatchID}) + return + } + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{Status: "failed"}) + }) + + _, err := app.BatchScrapeURLs(context.Background(), []string{"https://example.com"}, nil, nil, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed") +} + +func TestBatchScrapeURLs_DefaultPollInterval(t *testing.T) { + // Verify that omitting pollInterval uses 
the default (no panic, correct code path). + // The mock returns "completed" immediately so the polling sleep never fires. + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + respondJSON(w, http.StatusOK, BatchScrapeResponse{Success: true, ID: validBatchID}) + return + } + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 1, + Completed: 1, + Data: []*FirecrawlDocument{{Markdown: "# Page"}}, + }) + }) + + result, err := app.BatchScrapeURLs(context.Background(), []string{"https://example.com"}, nil, nil) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) +} + +// ---- CheckBatchScrapeStatus ---- + +func TestCheckBatchScrapeStatus_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "/v2/batch/scrape/"+validBatchID, r.URL.Path) + + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 3, + Completed: 3, + CreditsUsed: 3, + Data: []*FirecrawlDocument{{Markdown: "# Doc"}}, + }) + }) + + result, err := app.CheckBatchScrapeStatus(context.Background(), validBatchID) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Equal(t, 3, result.Total) + assert.Equal(t, 3, result.Completed) + assert.Equal(t, 3, result.CreditsUsed) + assert.Len(t, result.Data, 1) +} + +func TestCheckBatchScrapeStatus_Scraping(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "scraping", + Total: 5, + Completed: 2, + }) + }) + + result, err := app.CheckBatchScrapeStatus(context.Background(), validBatchID) + require.NoError(t, err) + assert.Equal(t, "scraping", result.Status) + assert.Equal(t, 5, result.Total) + assert.Equal(t, 2, result.Completed) +} + +func TestCheckBatchScrapeStatus_InvalidID(t 
*testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made for invalid ID") + }) + + _, err := app.CheckBatchScrapeStatus(context.Background(), "not-a-uuid") + assert.Error(t, err) + assert.Contains(t, err.Error(), "UUID") +} + +func TestCheckBatchScrapeStatus_PathTraversalID(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made for path traversal ID") + }) + + _, err := app.CheckBatchScrapeStatus(context.Background(), "../../etc/passwd") + assert.Error(t, err) +} + +func TestCheckBatchScrapeStatus_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"}) + }) + + _, err := app.CheckBatchScrapeStatus(context.Background(), validBatchID) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +// ---- monitorBatchScrapeStatus ---- + +func TestMonitorBatchScrapeStatus_CompletedImmediately(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# A"}, {Markdown: "# B"}}, + }) + }) + + headers := app.prepareHeaders(nil) + result, err := app.monitorBatchScrapeStatus(context.Background(), validBatchID, headers, 0) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 2) +} + +func TestMonitorBatchScrapeStatus_Failed(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{Status: "failed"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorBatchScrapeStatus(context.Background(), validBatchID, headers, 0) 
+ assert.Error(t, err) + assert.Contains(t, err.Error(), "failed") +} + +func TestMonitorBatchScrapeStatus_UnknownStatus(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{Status: "pending"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorBatchScrapeStatus(context.Background(), validBatchID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unknown batch scrape status") +} + +func TestMonitorBatchScrapeStatus_EmptyStatus(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{Status: ""}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorBatchScrapeStatus(context.Background(), validBatchID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid status") +} + +func TestMonitorBatchScrapeStatus_ContextCancelledBeforeRequest(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorBatchScrapeStatus(ctx, validBatchID, headers, 0) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + +func TestMonitorBatchScrapeStatus_CompletedNoData(t *testing.T) { + requestCount := 0 + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + requestCount++ + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 1, + Completed: 1, + Data: nil, + }) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorBatchScrapeStatus(context.Background(), validBatchID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "no data was returned") + assert.GreaterOrEqual(t, requestCount, 3) +} + +func 
TestMonitorBatchScrapeStatus_PaginationUnsafeURL(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + next := "https://attacker.example.com/steal?cursor=2" + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}}, + Next: &next, + }) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorBatchScrapeStatus(context.Background(), validBatchID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsafe pagination URL") +} + +// ---- CheckBatchScrapeStatus with PaginationConfig ---- + +func TestCheckBatchScrapeStatus_NoPagination_BackwardCompat(t *testing.T) { + // Calling without pagination parameter returns the single page (backward compatible). + var serverURL string + app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + next := serverURL + "/v2/batch/scrape/" + validBatchID + "?cursor=2" + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}}, + Next: &next, + }) + }) + serverURL = srv.URL + + result, err := app.CheckBatchScrapeStatus(context.Background(), validBatchID) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + // Only the first page returned — Next is present but not followed. + assert.Len(t, result.Data, 1) + assert.NotNil(t, result.Next) +} + +func TestCheckBatchScrapeStatus_AutoPaginate_FollowsNextURLs(t *testing.T) { + requestCount := 0 + var serverURL string + app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + requestCount++ + if requestCount == 1 { + // First page: has a Next URL. 
+ next := serverURL + "/v2/batch/scrape/" + validBatchID + "?cursor=2" + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}}, + Next: &next, + }) + return + } + // Second page: no Next URL, pagination ends. + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 2"}}, + }) + }) + serverURL = srv.URL + + cfg := &PaginationConfig{AutoPaginate: ptr(true)} + result, err := app.CheckBatchScrapeStatus(context.Background(), validBatchID, cfg) + require.NoError(t, err) + assert.Equal(t, 2, requestCount) + assert.Len(t, result.Data, 2) + assert.Equal(t, "# Page 1", result.Data[0].Markdown) + assert.Equal(t, "# Page 2", result.Data[1].Markdown) +} + +func TestCheckBatchScrapeStatus_AutoPaginate_UnsafeNextURL(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + next := "https://attacker.example.com/steal?cursor=2" + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}}, + Next: &next, + }) + }) + + cfg := &PaginationConfig{AutoPaginate: ptr(true)} + _, err := app.CheckBatchScrapeStatus(context.Background(), validBatchID, cfg) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsafe pagination URL") +} + +// ---- GetBatchScrapeStatusPage ---- + +func TestGetBatchScrapeStatusPage_Success(t *testing.T) { + app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + respondJSON(w, http.StatusOK, BatchScrapeStatusResponse{ + Status: "completed", + Total: 5, + Completed: 5, + Data: []*FirecrawlDocument{{Markdown: "# Page 2"}}, + }) + }) + + nextURL := srv.URL + "/v2/batch/scrape/" + validBatchID + "?cursor=2" + result, err := 
app.GetBatchScrapeStatusPage(context.Background(), nextURL) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 1) + assert.Equal(t, "# Page 2", result.Data[0].Markdown) +} + +func TestGetBatchScrapeStatusPage_InvalidURL_SSRFBlocked(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made to untrusted host") + }) + + _, err := app.GetBatchScrapeStatusPage(context.Background(), "https://attacker.example.com/steal") + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsafe pagination URL") +} diff --git a/client.go b/client.go new file mode 100644 index 0000000..4f0919e --- /dev/null +++ b/client.go @@ -0,0 +1,171 @@ +package firecrawl + +import ( + "fmt" + "log" + "net/http" + "net/url" + "os" + "time" +) + +// FirecrawlApp represents a client for the Firecrawl API. +type FirecrawlApp struct { + apiKey string // unexported — use APIKey() accessor + APIURL string + Client *http.Client + Version string + userAgent string // set by constructor; sent as User-Agent header on every request +} + +// APIKey returns the configured API key. +func (app *FirecrawlApp) APIKey() string { + return app.apiKey +} + +// String returns a human-readable representation with the API key redacted. +func (app *FirecrawlApp) String() string { + redacted := "***" + if len(app.apiKey) > 7 { + redacted = app.apiKey[:3] + "..." + app.apiKey[len(app.apiKey)-4:] + } + return fmt.Sprintf("FirecrawlApp{url: %s, key: %s}", app.APIURL, redacted) +} + +// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL. +// If the API key or API URL is not provided, it attempts to retrieve them from environment variables. +// If the API key is still not found, it returns an error. +// +// Parameters: +// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable. 
+// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev". +// - timeout: The timeout for the HTTP client. If not provided, it will default to 60 seconds. +// +// Returns: +// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL. +// - error: An error if the API key is not provided or retrieved. +func NewFirecrawlApp(apiKey, apiURL string, timeout ...time.Duration) (*FirecrawlApp, error) { + if apiKey == "" { + apiKey = os.Getenv("FIRECRAWL_API_KEY") + if apiKey == "" { + return nil, fmt.Errorf("%w", ErrNoAPIKey) + } + } + + if apiURL == "" { + apiURL = os.Getenv("FIRECRAWL_API_URL") + if apiURL == "" { + apiURL = "https://api.firecrawl.dev" + } + } + + // Warn when a non-localhost HTTP URL is used — API key will be sent in cleartext. + parsedURL, err := url.Parse(apiURL) + if err == nil && parsedURL.Scheme == "http" { + host := parsedURL.Hostname() + if host != "localhost" && host != "127.0.0.1" && host != "::1" { + log.Println("WARNING: firecrawl-go: API URL uses HTTP. API key will be transmitted in cleartext. Use HTTPS in production.") + } + } + + cfg := defaultClientConfig() + if len(timeout) > 0 { + cfg.timeout = timeout[0] + } + + return newFirecrawlAppFromConfig(apiKey, apiURL, cfg) +} + +// NewFirecrawlAppWithOptions creates a new instance of FirecrawlApp using +// functional options for configuration. +// +// Parameters: +// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable. +// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev". +// - opts: Functional options (WithTimeout, WithTransport, WithUserAgent, WithMaxIdleConns, WithMaxIdleConnsPerHost). 
+// +// Returns: +// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key, API URL, and options. +// - error: An error if the API key is not provided or retrieved. +func NewFirecrawlAppWithOptions(apiKey, apiURL string, opts ...ClientOption) (*FirecrawlApp, error) { + if apiKey == "" { + apiKey = os.Getenv("FIRECRAWL_API_KEY") + if apiKey == "" { + return nil, fmt.Errorf("%w", ErrNoAPIKey) + } + } + + if apiURL == "" { + apiURL = os.Getenv("FIRECRAWL_API_URL") + if apiURL == "" { + apiURL = "https://api.firecrawl.dev" + } + } + + // Warn when a non-localhost HTTP URL is used — API key will be sent in cleartext. + parsedURL, err := url.Parse(apiURL) + if err == nil && parsedURL.Scheme == "http" { + host := parsedURL.Hostname() + if host != "localhost" && host != "127.0.0.1" && host != "::1" { + log.Println("WARNING: firecrawl-go: API URL uses HTTP. API key will be transmitted in cleartext. Use HTTPS in production.") + } + } + + cfg := defaultClientConfig() + for _, opt := range opts { + opt(cfg) + } + + return newFirecrawlAppFromConfig(apiKey, apiURL, cfg) +} + +// newFirecrawlAppFromConfig builds a FirecrawlApp from a resolved clientConfig. +// apiKey and apiURL must already be validated and resolved before calling this. 
+func newFirecrawlAppFromConfig(apiKey, apiURL string, cfg *clientConfig) (*FirecrawlApp, error) { + var transport http.RoundTripper + if cfg.transport != nil { + transport = cfg.transport + } else { + defaultT, ok := http.DefaultTransport.(*http.Transport) + if !ok { + return nil, fmt.Errorf("firecrawl-go: http.DefaultTransport is not *http.Transport; use WithTransport to supply a custom transport") + } + cloned := defaultT.Clone() + cloned.MaxIdleConns = cfg.maxIdleConns + cloned.MaxIdleConnsPerHost = cfg.maxIdleConnsPerHost + transport = cloned + } + + client := &http.Client{ + Timeout: cfg.timeout, + Transport: transport, + } + + return &FirecrawlApp{ + apiKey: apiKey, + APIURL: apiURL, + Client: client, + Version: SDKVersion, + userAgent: cfg.userAgent, + }, nil +} + +// prepareHeaders prepares the headers for an HTTP request. +// +// Parameters: +// - idempotencyKey: A string representing the idempotency key to be included in the headers. +// If the idempotency key is an empty string, it will not be included in the headers. +// +// Returns: +// - map[string]string: A map containing the headers for the HTTP request. +func (app *FirecrawlApp) prepareHeaders(idempotencyKey *string) map[string]string { + headers := map[string]string{ + "Content-Type": "application/json", + "Authorization": fmt.Sprintf("Bearer %s", app.apiKey), + "User-Agent": app.userAgent, + } + if idempotencyKey != nil { + headers["x-idempotency-key"] = *idempotencyKey + } + return headers +} diff --git a/client_options.go b/client_options.go new file mode 100644 index 0000000..a5a2525 --- /dev/null +++ b/client_options.go @@ -0,0 +1,74 @@ +package firecrawl + +import ( + "net/http" + "time" +) + +// SDKVersion is the current version of the firecrawl-go SDK. +const SDKVersion = "2.0.0" + +// clientConfig holds the configuration for building the FirecrawlApp HTTP client. 
+type clientConfig struct { + timeout time.Duration + transport *http.Transport + userAgent string + maxIdleConns int + maxIdleConnsPerHost int +} + +// defaultClientConfig returns sensible defaults for the HTTP client configuration. +func defaultClientConfig() *clientConfig { + return &clientConfig{ + timeout: 120 * time.Second, + userAgent: "firecrawl-go/" + SDKVersion, + maxIdleConns: 100, + maxIdleConnsPerHost: 10, + } +} + +// ClientOption configures the FirecrawlApp HTTP client. +type ClientOption func(*clientConfig) + +// WithTimeout sets the HTTP client timeout. +// +// This is the recommended alternative to passing a time.Duration as the variadic +// argument to NewFirecrawlApp. Default: 120 seconds. +func WithTimeout(d time.Duration) ClientOption { + return func(c *clientConfig) { + c.timeout = d + } +} + +// WithTransport sets a custom http.Transport for the HTTP client. +// When set, WithMaxIdleConns and WithMaxIdleConnsPerHost are ignored — +// configure those directly on the transport you provide. +func WithTransport(t *http.Transport) ClientOption { + return func(c *clientConfig) { + c.transport = t + } +} + +// WithUserAgent sets a custom User-Agent header sent with every request. +// Default: "firecrawl-go/{version}". +func WithUserAgent(ua string) ClientOption { + return func(c *clientConfig) { + c.userAgent = ua + } +} + +// WithMaxIdleConns sets the maximum number of idle (keep-alive) connections +// across all hosts. Only applies when no custom Transport is provided. Default: 100. +func WithMaxIdleConns(n int) ClientOption { + return func(c *clientConfig) { + c.maxIdleConns = n + } +} + +// WithMaxIdleConnsPerHost sets the maximum number of idle (keep-alive) connections +// per host. Only applies when no custom Transport is provided. Default: 10. 
+func WithMaxIdleConnsPerHost(n int) ClientOption { + return func(c *clientConfig) { + c.maxIdleConnsPerHost = n + } +} diff --git a/client_test.go b/client_test.go new file mode 100644 index 0000000..f1d6fa1 --- /dev/null +++ b/client_test.go @@ -0,0 +1,214 @@ +package firecrawl + +import ( + "fmt" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewFirecrawlApp_ValidKey(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com") + require.NoError(t, err) + assert.Equal(t, "fc-test-key", app.APIKey()) + assert.Equal(t, "https://api.example.com", app.APIURL) +} + +func TestNewFirecrawlApp_EmptyKey(t *testing.T) { + // Unset env var to ensure no fallback + t.Setenv("FIRECRAWL_API_KEY", "") + _, err := NewFirecrawlApp("", "https://api.example.com") + assert.Error(t, err) + assert.ErrorIs(t, err, ErrNoAPIKey) +} + +func TestNewFirecrawlApp_DefaultURL(t *testing.T) { + t.Setenv("FIRECRAWL_API_URL", "") + app, err := NewFirecrawlApp("fc-test-key", "") + require.NoError(t, err) + assert.Equal(t, "https://api.firecrawl.dev", app.APIURL) +} + +func TestNewFirecrawlApp_EnvFallback(t *testing.T) { + t.Setenv("FIRECRAWL_API_KEY", "fc-env-key") + app, err := NewFirecrawlApp("", "https://api.example.com") + require.NoError(t, err) + assert.Equal(t, "fc-env-key", app.APIKey()) +} + +func TestNewFirecrawlApp_EnvURLFallback(t *testing.T) { + t.Setenv("FIRECRAWL_API_URL", "https://custom.api.example.com") + app, err := NewFirecrawlApp("fc-test-key", "") + require.NoError(t, err) + assert.Equal(t, "https://custom.api.example.com", app.APIURL) +} + +func TestNewFirecrawlApp_ClientNotNil(t *testing.T) { + // Verify the HTTP client is properly initialized + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com") + require.NoError(t, err) + assert.NotNil(t, app.Client) +} + +func TestPrepareHeaders_WithIdempotencyKey(t *testing.T) { + app, err := 
NewFirecrawlApp("fc-test-key", "https://api.firecrawl.dev") + require.NoError(t, err) + + key := "my-idempotency-key" + headers := app.prepareHeaders(&key) + assert.Equal(t, "Bearer fc-test-key", headers["Authorization"]) + assert.Equal(t, "application/json", headers["Content-Type"]) + assert.Equal(t, "my-idempotency-key", headers["x-idempotency-key"]) +} + +func TestPrepareHeaders_WithEmptyIdempotencyKey(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.firecrawl.dev") + require.NoError(t, err) + + key := "" + headers := app.prepareHeaders(&key) + assert.Equal(t, "Bearer fc-test-key", headers["Authorization"]) + // Empty key pointer — the key is still included (empty string) + assert.Equal(t, "", headers["x-idempotency-key"]) +} + +func TestPrepareHeaders_NilIdempotencyKey(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.firecrawl.dev") + require.NoError(t, err) + + headers := app.prepareHeaders(nil) + assert.Equal(t, "Bearer fc-test-key", headers["Authorization"]) + assert.Equal(t, "application/json", headers["Content-Type"]) + _, hasKey := headers["x-idempotency-key"] + assert.False(t, hasKey, "nil idempotency key should not set the header") +} + +func TestPrepareHeaders_AuthorizationFormat(t *testing.T) { + app, err := NewFirecrawlApp("fc-my-secret-key", "https://api.firecrawl.dev") + require.NoError(t, err) + + headers := app.prepareHeaders(nil) + assert.Equal(t, "Bearer fc-my-secret-key", headers["Authorization"]) +} + +// IMP-15: HTTP Client Improvements tests + +func TestSDKVersion_NotEmpty(t *testing.T) { + assert.NotEmpty(t, SDKVersion, "SDKVersion constant must not be empty") +} + +func TestDefaultUserAgent(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com") + require.NoError(t, err) + + headers := app.prepareHeaders(nil) + expectedUA := fmt.Sprintf("firecrawl-go/%s", SDKVersion) + assert.Equal(t, expectedUA, headers["User-Agent"], "default User-Agent should be 
firecrawl-go/{version}") +} + +func TestDefaultUserAgent_WithOptions(t *testing.T) { + app, err := NewFirecrawlAppWithOptions("fc-test-key", "https://api.example.com") + require.NoError(t, err) + + headers := app.prepareHeaders(nil) + expectedUA := fmt.Sprintf("firecrawl-go/%s", SDKVersion) + assert.Equal(t, expectedUA, headers["User-Agent"], "default User-Agent via WithOptions should be firecrawl-go/{version}") +} + +func TestCustomUserAgent(t *testing.T) { + app, err := NewFirecrawlAppWithOptions( + "fc-test-key", + "https://api.example.com", + WithUserAgent("my-custom-agent/1.0"), + ) + require.NoError(t, err) + + headers := app.prepareHeaders(nil) + assert.Equal(t, "my-custom-agent/1.0", headers["User-Agent"]) +} + +func TestWithTimeout(t *testing.T) { + wantTimeout := 30 * time.Second + app, err := NewFirecrawlAppWithOptions( + "fc-test-key", + "https://api.example.com", + WithTimeout(wantTimeout), + ) + require.NoError(t, err) + assert.Equal(t, wantTimeout, app.Client.Timeout) +} + +func TestWithTransport(t *testing.T) { + customTransport := &http.Transport{ + MaxIdleConns: 50, + MaxIdleConnsPerHost: 5, + } + + app, err := NewFirecrawlAppWithOptions( + "fc-test-key", + "https://api.example.com", + WithTransport(customTransport), + ) + require.NoError(t, err) + assert.Equal(t, customTransport, app.Client.Transport, "custom transport should be used") +} + +func TestWithMaxIdleConns(t *testing.T) { + app, err := NewFirecrawlAppWithOptions( + "fc-test-key", + "https://api.example.com", + WithMaxIdleConns(200), + WithMaxIdleConnsPerHost(20), + ) + require.NoError(t, err) + + transport, ok := app.Client.Transport.(*http.Transport) + require.True(t, ok, "transport should be *http.Transport when no custom transport is set") + assert.Equal(t, 200, transport.MaxIdleConns) + assert.Equal(t, 20, transport.MaxIdleConnsPerHost) +} + +func TestDefaultTransportCloned(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com") + require.NoError(t, 
err) + + // The transport must NOT be the same pointer as http.DefaultTransport — + // it should be a cloned copy so SDK settings don't bleed into the process. + assert.NotEqual(t, http.DefaultTransport, app.Client.Transport, + "transport should be a clone of http.DefaultTransport, not the same pointer") +} + +func TestBackwardCompatibility_NoTimeout(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com") + require.NoError(t, err) + assert.Equal(t, 120*time.Second, app.Client.Timeout, "default timeout should be 120s") +} + +func TestBackwardCompatibility_WithTimeout(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com", 30*time.Second) + require.NoError(t, err) + assert.Equal(t, 30*time.Second, app.Client.Timeout, "variadic timeout parameter should still work") +} + +func TestNewFirecrawlAppWithOptions_EmptyKey(t *testing.T) { + t.Setenv("FIRECRAWL_API_KEY", "") + _, err := NewFirecrawlAppWithOptions("", "https://api.example.com") + assert.Error(t, err) + assert.ErrorIs(t, err, ErrNoAPIKey) +} + +func TestNewFirecrawlAppWithOptions_DefaultURL(t *testing.T) { + t.Setenv("FIRECRAWL_API_URL", "") + app, err := NewFirecrawlAppWithOptions("fc-test-key", "") + require.NoError(t, err) + assert.Equal(t, "https://api.firecrawl.dev", app.APIURL) +} + +func TestVersionFieldSet(t *testing.T) { + app, err := NewFirecrawlApp("fc-test-key", "https://api.example.com") + require.NoError(t, err) + assert.Equal(t, SDKVersion, app.Version, "Version field should be set to SDKVersion") +} diff --git a/crawl.go b/crawl.go new file mode 100644 index 0000000..50af613 --- /dev/null +++ b/crawl.go @@ -0,0 +1,381 @@ +package firecrawl + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" +) + +// crawlRequest is the internal request struct for crawl operations. +// It is unexported — callers use CrawlParams instead. 
+type crawlRequest struct { + URL string `json:"url"` + ScrapeOptions *ScrapeParams `json:"scrapeOptions,omitempty"` + Webhook *WebhookConfig `json:"webhook,omitempty"` + Limit *int `json:"limit,omitempty"` + IncludePaths []string `json:"includePaths,omitempty"` + ExcludePaths []string `json:"excludePaths,omitempty"` + MaxDiscoveryDepth *int `json:"maxDiscoveryDepth,omitempty"` + AllowExternalLinks *bool `json:"allowExternalLinks,omitempty"` + IgnoreQueryParameters *bool `json:"ignoreQueryParameters,omitempty"` + Sitemap *string `json:"sitemap,omitempty"` + CrawlEntireDomain *bool `json:"crawlEntireDomain,omitempty"` + AllowSubdomains *bool `json:"allowSubdomains,omitempty"` + Delay *float64 `json:"delay,omitempty"` + MaxConcurrency *int `json:"maxConcurrency,omitempty"` + Prompt *string `json:"prompt,omitempty"` + RegexOnFullURL *bool `json:"regexOnFullURL,omitempty"` + ZeroDataRetention *bool `json:"zeroDataRetention,omitempty"` +} + +// buildCrawlRequest creates a crawlRequest from URL and CrawlParams. +// Shared by CrawlURL and AsyncCrawlURL to eliminate duplicated body construction. +func buildCrawlRequest(url string, params *CrawlParams) (*crawlRequest, error) { + req := &crawlRequest{URL: url} + if params == nil { + return req, nil + } + + // Only include ScrapeOptions if at least one field is set. 
+ scrapeOpts := params.ScrapeOptions + if scrapeOpts.Formats != nil || scrapeOpts.Headers != nil || scrapeOpts.IncludeTags != nil || + scrapeOpts.ExcludeTags != nil || scrapeOpts.OnlyMainContent != nil || scrapeOpts.WaitFor != nil || + scrapeOpts.Timeout != nil || scrapeOpts.MaxAge != nil || scrapeOpts.MinAge != nil || + scrapeOpts.JsonOptions != nil || scrapeOpts.Mobile != nil || scrapeOpts.SkipTlsVerification != nil || + scrapeOpts.BlockAds != nil || scrapeOpts.Proxy != nil || scrapeOpts.Location != nil || + scrapeOpts.Parsers != nil || scrapeOpts.Actions != nil || scrapeOpts.RemoveBase64Images != nil || + scrapeOpts.StoreInCache != nil || scrapeOpts.ZeroDataRetention != nil { + req.ScrapeOptions = &scrapeOpts + } + + req.Webhook = params.Webhook + req.Limit = params.Limit + req.IncludePaths = params.IncludePaths + req.ExcludePaths = params.ExcludePaths + req.MaxDiscoveryDepth = params.MaxDiscoveryDepth + req.AllowExternalLinks = params.AllowExternalLinks + req.IgnoreQueryParameters = params.IgnoreQueryParameters + req.Sitemap = params.Sitemap + req.CrawlEntireDomain = params.CrawlEntireDomain + req.AllowSubdomains = params.AllowSubdomains + req.Delay = params.Delay + req.MaxConcurrency = params.MaxConcurrency + req.Prompt = params.Prompt + req.RegexOnFullURL = params.RegexOnFullURL + req.ZeroDataRetention = params.ZeroDataRetention + + return req, nil +} + +// CrawlURL starts a crawl job for the specified URL using the Firecrawl API. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - url: The URL to crawl. +// - params: Optional parameters for the crawl request. +// - idempotencyKey: An optional idempotency key to ensure the request is idempotent (can be nil). +// - pollInterval: An optional interval (in seconds) at which to poll the job status. Default is 2 seconds. +// +// Returns: +// - CrawlStatusResponse: The crawl result if the job is completed. +// - error: An error if the crawl request fails. 
+func (app *FirecrawlApp) CrawlURL(ctx context.Context, url string, params *CrawlParams, idempotencyKey *string, pollInterval ...int) (*CrawlStatusResponse, error) { + var key string + if idempotencyKey != nil { + key = *idempotencyKey + } + + headers := app.prepareHeaders(&key) + + req, err := buildCrawlRequest(url, params) + if err != nil { + return nil, fmt.Errorf("failed to build crawl request: %w", err) + } + + crawlBodyBytes, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal crawl request: %w", err) + } + + actualPollInterval := 2 + if len(pollInterval) > 0 { + actualPollInterval = pollInterval[0] + } + + resp, err := app.makeRequest( + ctx, + http.MethodPost, + fmt.Sprintf("%s/v2/crawl", app.APIURL), + crawlBodyBytes, + headers, + "start crawl job", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var crawlResponse CrawlResponse + err = json.Unmarshal(resp, &crawlResponse) + if err != nil { + return nil, err + } + + return app.monitorJobStatus(ctx, crawlResponse.ID, headers, actualPollInterval) +} + +// AsyncCrawlURL starts a crawl job for the specified URL using the Firecrawl API. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - url: The URL to crawl. +// - params: Optional parameters for the crawl request. +// - idempotencyKey: An optional idempotency key to ensure the request is idempotent. +// +// Returns: +// - *CrawlResponse: The crawl response with id. +// - error: An error if the crawl request fails. 
+func (app *FirecrawlApp) AsyncCrawlURL(ctx context.Context, url string, params *CrawlParams, idempotencyKey *string) (*CrawlResponse, error) { + var key string + if idempotencyKey != nil { + key = *idempotencyKey + } + + headers := app.prepareHeaders(&key) + + req, err := buildCrawlRequest(url, params) + if err != nil { + return nil, fmt.Errorf("failed to build crawl request: %w", err) + } + + crawlBodyBytes, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal crawl request: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodPost, + fmt.Sprintf("%s/v2/crawl", app.APIURL), + crawlBodyBytes, + headers, + "start crawl job", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var crawlResponse CrawlResponse + err = json.Unmarshal(resp, &crawlResponse) + if err != nil { + return nil, err + } + + if crawlResponse.ID == "" { + return nil, fmt.Errorf("failed to get job ID") + } + + return &crawlResponse, nil +} + +// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API. +// +// When a PaginationConfig is provided with AutoPaginate enabled, it automatically +// follows Next URLs to collect all results, respecting MaxPages, MaxResults, and +// MaxWaitTime limits. Without PaginationConfig, only the first page is returned. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - ID: The ID of the crawl job to check. +// - pagination: An optional PaginationConfig to control auto-pagination behavior. +// +// Returns: +// - *CrawlStatusResponse: The status of the crawl job (possibly spanning multiple pages). +// - error: An error if the crawl status check request fails. 
+func (app *FirecrawlApp) CheckCrawlStatus(ctx context.Context, ID string, pagination ...*PaginationConfig) (*CrawlStatusResponse, error) { + if err := validateJobID(ID); err != nil { + return nil, err + } + headers := app.prepareHeaders(nil) + apiURL := fmt.Sprintf("%s/v2/crawl/%s", app.APIURL, ID) + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + apiURL, + nil, + headers, + "check crawl status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var jobStatusResponse CrawlStatusResponse + if err = json.Unmarshal(resp, &jobStatusResponse); err != nil { + return nil, err + } + + // Without PaginationConfig or AutoPaginate disabled, return the single page. + if len(pagination) == 0 || pagination[0] == nil || pagination[0].AutoPaginate == nil || !*pagination[0].AutoPaginate { + return &jobStatusResponse, nil + } + + return app.autoPaginateCrawlStatus(ctx, &jobStatusResponse, headers, pagination[0]) +} + +// autoPaginateCrawlStatus follows Next URLs collecting all data, respecting +// MaxPages, MaxResults, and MaxWaitTime limits from the provided PaginationConfig. +func (app *FirecrawlApp) autoPaginateCrawlStatus(ctx context.Context, initial *CrawlStatusResponse, headers map[string]string, cfg *PaginationConfig) (*CrawlStatusResponse, error) { + allData := initial.Data + current := initial + pagesCollected := 1 + startTime := time.Now() + + maxPages := 0 + if cfg.MaxPages != nil { + maxPages = *cfg.MaxPages + } + maxResults := 0 + if cfg.MaxResults != nil { + maxResults = *cfg.MaxResults + } + maxWaitSeconds := 0 + if cfg.MaxWaitTime != nil { + maxWaitSeconds = *cfg.MaxWaitTime + } + + for current.Next != nil { + // Check page limit. + if maxPages > 0 && pagesCollected >= maxPages { + break + } + // Check result limit. + if maxResults > 0 && len(allData) >= maxResults { + allData = allData[:maxResults] + break + } + // Check time limit. 
+ if maxWaitSeconds > 0 && int(time.Since(startTime).Seconds()) >= maxWaitSeconds { + break + } + + if ctx.Err() != nil { + return nil, ctx.Err() + } + + if err := validatePaginationURL(app.APIURL, *current.Next); err != nil { + return nil, fmt.Errorf("unsafe pagination URL: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + *current.Next, + nil, + headers, + "fetch next page of crawl status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var pageData CrawlStatusResponse + if err := json.Unmarshal(resp, &pageData); err != nil { + return nil, fmt.Errorf("failed to parse crawl status page: %w", err) + } + + if pageData.Data != nil { + allData = append(allData, pageData.Data...) + } + current = &pageData + pagesCollected++ + } + + current.Data = allData + return current, nil +} + +// GetCrawlStatusPage fetches a specific page of crawl status results by URL. +// Use this for manual pagination — pass the Next URL from a previous CrawlStatusResponse. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - nextURL: The full URL of the next results page (from CrawlStatusResponse.Next). +// +// Returns: +// - *CrawlStatusResponse: The results for this page. +// - error: An error if the request fails or the URL is not trusted. 
+func (app *FirecrawlApp) GetCrawlStatusPage(ctx context.Context, nextURL string) (*CrawlStatusResponse, error) { + if err := validatePaginationURL(app.APIURL, nextURL); err != nil { + return nil, fmt.Errorf("unsafe pagination URL: %w", err) + } + + headers := app.prepareHeaders(nil) + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + nextURL, + nil, + headers, + "fetch crawl status page", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var statusResponse CrawlStatusResponse + if err := json.Unmarshal(resp, &statusResponse); err != nil { + return nil, fmt.Errorf("failed to parse crawl status page: %w", err) + } + + return &statusResponse, nil +} + +// CancelCrawlJob cancels a crawl job using the Firecrawl API. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - ID: The ID of the crawl job to cancel. +// +// Returns: +// - string: The status of the crawl job after cancellation. +// - error: An error if the crawl job cancellation request fails. +func (app *FirecrawlApp) CancelCrawlJob(ctx context.Context, ID string) (string, error) { + if err := validateJobID(ID); err != nil { + return "", err + } + headers := app.prepareHeaders(nil) + apiURL := fmt.Sprintf("%s/v2/crawl/%s", app.APIURL, ID) + resp, err := app.makeRequest( + ctx, + http.MethodDelete, + apiURL, + nil, + headers, + "cancel crawl job", + ) + if err != nil { + return "", err + } + + var cancelCrawlJobResponse CancelCrawlJobResponse + err = json.Unmarshal(resp, &cancelCrawlJobResponse) + if err != nil { + return "", err + } + + return cancelCrawlJobResponse.Status, nil +} diff --git a/crawl_test.go b/crawl_test.go new file mode 100644 index 0000000..cc8879e --- /dev/null +++ b/crawl_test.go @@ -0,0 +1,568 @@ +package firecrawl + +import ( + "context" + "fmt" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// validCrawlID is a valid UUID used across crawl tests. 
+const validCrawlID = "550e8400-e29b-41d4-a716-446655440000" + +// ---- CrawlURL ---- + +func TestCrawlURL_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost && r.URL.Path == "/v2/crawl" { + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: validCrawlID, + }) + return + } + // GET /v2/crawl/{id} — immediately completed + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 1, + Completed: 1, + Data: []*FirecrawlDocument{{Markdown: "# Page"}}, + }) + }) + + result, err := app.CrawlURL(context.Background(), "https://example.com", nil, nil) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 1) + assert.Equal(t, "# Page", result.Data[0].Markdown) +} + +func TestCrawlURL_AllParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + assert.NotNil(t, body["limit"]) + assert.NotNil(t, body["maxDiscoveryDepth"]) + assert.NotNil(t, body["crawlEntireDomain"]) + assert.NotNil(t, body["allowSubdomains"]) + assert.NotNil(t, body["ignoreQueryParameters"]) + assert.NotNil(t, body["zeroDataRetention"]) + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: validCrawlID, + }) + return + } + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 1, + Completed: 1, + Data: []*FirecrawlDocument{{Markdown: "# Page"}}, + }) + }) + + params := &CrawlParams{ + Limit: ptr(100), + MaxDiscoveryDepth: ptr(3), + CrawlEntireDomain: ptr(true), + AllowSubdomains: ptr(true), + IgnoreQueryParameters: ptr(true), + ZeroDataRetention: ptr(true), + IncludePaths: []string{"/docs/*"}, + ExcludePaths: []string{"/admin/*"}, + } + result, err := app.CrawlURL(context.Background(), "https://example.com", 
params, nil) + require.NoError(t, err) + assert.NotNil(t, result) +} + +func TestCrawlURL_WithIdempotencyKey(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + assert.Equal(t, "test-idempotency-key", r.Header.Get("x-idempotency-key")) + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: validCrawlID, + }) + return + } + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 1, + Completed: 1, + Data: []*FirecrawlDocument{{Markdown: "# Page"}}, + }) + }) + + result, err := app.CrawlURL(context.Background(), "https://example.com", nil, ptr("test-idempotency-key")) + require.NoError(t, err) + assert.NotNil(t, result) +} + +func TestCrawlURL_Failed(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + respondJSON(w, http.StatusOK, CrawlResponse{Success: true, ID: validCrawlID}) + return + } + respondJSON(w, http.StatusOK, CrawlStatusResponse{Status: "failed"}) + }) + + _, err := app.CrawlURL(context.Background(), "https://example.com", nil, nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed") +} + +func TestCrawlURL_PollsUntilComplete(t *testing.T) { + pollCount := 0 + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodPost { + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: validCrawlID, + }) + return + } + // GET: Return completed immediately — tests that the polling loop works + // and correctly collects data on first successful poll. 
+ pollCount++ + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 5, + Completed: 5, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}, {Markdown: "# Page 2"}}, + }) + }) + + result, err := app.CrawlURL(context.Background(), "https://example.com", nil, nil) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 2) + assert.GreaterOrEqual(t, pollCount, 1) +} + +func TestCrawlURL_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + _, err := app.CrawlURL(ctx, "https://example.com", nil, nil) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + +func TestCrawlURL_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"}) + }) + + _, err := app.CrawlURL(context.Background(), "https://example.com", nil, nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +// Note: pagination is tested directly via monitorJobStatus in helpers_test.go. +// CrawlURL delegates pagination to monitorJobStatus, so it is implicitly covered. 
+ +// ---- AsyncCrawlURL ---- + +func TestAsyncCrawlURL_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "/v2/crawl", r.URL.Path) + + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: validCrawlID, + }) + }) + + result, err := app.AsyncCrawlURL(context.Background(), "https://example.com", nil, nil) + require.NoError(t, err) + assert.Equal(t, validCrawlID, result.ID) + assert.True(t, result.Success) +} + +func TestAsyncCrawlURL_AllParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + assert.NotNil(t, body["limit"]) + assert.NotNil(t, body["webhook"]) + + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: validCrawlID, + }) + }) + + params := &CrawlParams{ + Limit: ptr(50), + Webhook: &WebhookConfig{ + URL: "https://webhook.example.com/callback", + Events: []string{"completed", "failed"}, + }, + } + result, err := app.AsyncCrawlURL(context.Background(), "https://example.com", params, nil) + require.NoError(t, err) + assert.Equal(t, validCrawlID, result.ID) +} + +func TestAsyncCrawlURL_MissingID(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, CrawlResponse{ + Success: true, + ID: "", // Missing ID + }) + }) + + _, err := app.AsyncCrawlURL(context.Background(), "https://example.com", nil, nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "job ID") +} + +func TestAsyncCrawlURL_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"}) + }) + + 
_, err := app.AsyncCrawlURL(context.Background(), "https://example.com", nil, nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +// ---- CheckCrawlStatus ---- + +func TestCheckCrawlStatus_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "/v2/crawl/"+validCrawlID, r.URL.Path) + + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 10, + Completed: 10, + Data: []*FirecrawlDocument{{Markdown: "# Page"}}, + }) + }) + + result, err := app.CheckCrawlStatus(context.Background(), validCrawlID) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Equal(t, 10, result.Total) + assert.Equal(t, 10, result.Completed) +} + +func TestCheckCrawlStatus_InvalidID(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made for invalid ID") + }) + + _, err := app.CheckCrawlStatus(context.Background(), "not-a-uuid") + assert.Error(t, err) + assert.Contains(t, err.Error(), "UUID") +} + +func TestCheckCrawlStatus_PathTraversalID(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made for path traversal ID") + }) + + _, err := app.CheckCrawlStatus(context.Background(), "../../etc/passwd") + assert.Error(t, err) +} + +func TestCheckCrawlStatus_ServerError(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusInternalServerError, map[string]string{"error": "Internal failure"}) + }) + + _, err := app.CheckCrawlStatus(context.Background(), validCrawlID) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrServerError) +} + +// ---- CancelCrawlJob ---- + +func TestCancelCrawlJob_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, 
http.MethodDelete, r.Method)
+		assert.Equal(t, "/v2/crawl/"+validCrawlID, r.URL.Path)
+
+		respondJSON(w, http.StatusOK, CancelCrawlJobResponse{
+			Success: true,
+			Status:  "cancelled",
+		})
+	})
+
+	status, err := app.CancelCrawlJob(context.Background(), validCrawlID)
+	require.NoError(t, err)
+	assert.Equal(t, "cancelled", status)
+}
+
+func TestCancelCrawlJob_InvalidID(t *testing.T) {
+	// Client-side UUID validation should reject the ID before any HTTP call.
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		t.Fatal("request should not be made for invalid ID")
+	})
+
+	_, err := app.CancelCrawlJob(context.Background(), "not-a-uuid")
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "UUID")
+}
+
+func TestCancelCrawlJob_Unauthorized(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"})
+	})
+
+	_, err := app.CancelCrawlJob(context.Background(), validCrawlID)
+	assert.Error(t, err)
+	assert.ErrorIs(t, err, ErrUnauthorized)
+}
+
+// ---- buildCrawlRequest ----
+
+func TestBuildCrawlRequest_NilParams(t *testing.T) {
+	// nil params produce a request with only the URL populated.
+	req, err := buildCrawlRequest("https://example.com", nil)
+	require.NoError(t, err)
+	assert.Equal(t, "https://example.com", req.URL)
+	assert.Nil(t, req.ScrapeOptions)
+	assert.Nil(t, req.Limit)
+	assert.Nil(t, req.Webhook)
+}
+
+func TestBuildCrawlRequest_AllParams(t *testing.T) {
+	// Every optional field should be copied through to the request struct.
+	params := &CrawlParams{
+		Limit:             ptr(100),
+		MaxDiscoveryDepth: ptr(3),
+		CrawlEntireDomain: ptr(true),
+		AllowSubdomains:   ptr(false),
+		IncludePaths:      []string{"/blog/*"},
+		ExcludePaths:      []string{"/admin/*"},
+		Prompt:            ptr("Crawl only article pages"),
+		RegexOnFullURL:    ptr(true),
+		ZeroDataRetention: ptr(true),
+	}
+
+	req, err := buildCrawlRequest("https://example.com", params)
+	require.NoError(t, err)
+	assert.Equal(t, "https://example.com", req.URL)
+	assert.Equal(t, 100, *req.Limit)
+	assert.Equal(t, 3, *req.MaxDiscoveryDepth)
+	assert.True(t, *req.CrawlEntireDomain)
+	assert.False(t, *req.AllowSubdomains)
+	assert.Equal(t, []string{"/blog/*"}, req.IncludePaths)
+	assert.Equal(t, []string{"/admin/*"}, req.ExcludePaths)
+	assert.Equal(t, "Crawl only article pages", *req.Prompt)
+	assert.True(t, *req.RegexOnFullURL)
+	assert.True(t, *req.ZeroDataRetention)
+}
+
+func TestBuildCrawlRequest_WithScrapeOptions(t *testing.T) {
+	params := &CrawlParams{
+		ScrapeOptions: ScrapeParams{
+			Formats:         []string{"markdown"},
+			OnlyMainContent: ptr(true),
+		},
+	}
+
+	req, err := buildCrawlRequest("https://example.com", params)
+	require.NoError(t, err)
+	assert.NotNil(t, req.ScrapeOptions)
+	assert.Equal(t, []string{"markdown"}, req.ScrapeOptions.Formats)
+	assert.True(t, *req.ScrapeOptions.OnlyMainContent)
+}
+
+func TestBuildCrawlRequest_EmptyScrapeOptions(t *testing.T) {
+	// Empty ScrapeOptions should not be included in the request
+	params := &CrawlParams{
+		Limit: ptr(10),
+		// ScrapeOptions is zero value — should be omitted
+	}
+
+	req, err := buildCrawlRequest("https://example.com", params)
+	require.NoError(t, err)
+	assert.Nil(t, req.ScrapeOptions)
+	assert.Equal(t, 10, *req.Limit)
+}
+
+// ---- CheckCrawlStatus with PaginationConfig ----
+
+func TestCheckCrawlStatus_NoPagination_BackwardCompat(t *testing.T) {
+	// Calling without pagination parameter returns the single page (backward compatible).
+	var serverURL string
+	app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		next := serverURL + "/v2/crawl/" + validCrawlID + "?cursor=2"
+		respondJSON(w, http.StatusOK, CrawlStatusResponse{
+			Status:    "completed",
+			Total:     2,
+			Completed: 2,
+			Data:      []*FirecrawlDocument{{Markdown: "# Page 1"}},
+			Next:      &next,
+		})
+	})
+	// serverURL is assigned before any request is issued; the handler closure
+	// only reads it once the client call below fires.
+	serverURL = srv.URL
+
+	result, err := app.CheckCrawlStatus(context.Background(), validCrawlID)
+	require.NoError(t, err)
+	assert.Equal(t, "completed", result.Status)
+	// Only the first page returned — Next is present but not followed.
+	assert.Len(t, result.Data, 1)
+	assert.NotNil(t, result.Next)
+}
+
+func TestCheckCrawlStatus_AutoPaginate_FollowsNextURLs(t *testing.T) {
+	// NOTE(review): requestCount is a plain int mutated on the server goroutine
+	// and read on the test goroutine; the extract tests use atomic.Int32 for
+	// the same pattern — consider aligning for -race cleanliness.
+	requestCount := 0
+	var serverURL string
+	app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		requestCount++
+		if requestCount == 1 {
+			// First page: has a Next URL.
+			next := serverURL + "/v2/crawl/" + validCrawlID + "?cursor=2"
+			respondJSON(w, http.StatusOK, CrawlStatusResponse{
+				Status:    "completed",
+				Total:     2,
+				Completed: 2,
+				Data:      []*FirecrawlDocument{{Markdown: "# Page 1"}},
+				Next:      &next,
+			})
+			return
+		}
+		// Second page: no Next URL, pagination ends.
+		respondJSON(w, http.StatusOK, CrawlStatusResponse{
+			Status:    "completed",
+			Total:     2,
+			Completed: 2,
+			Data:      []*FirecrawlDocument{{Markdown: "# Page 2"}},
+		})
+	})
+	serverURL = srv.URL
+
+	cfg := &PaginationConfig{AutoPaginate: ptr(true)}
+	result, err := app.CheckCrawlStatus(context.Background(), validCrawlID, cfg)
+	require.NoError(t, err)
+	assert.Equal(t, 2, requestCount)
+	assert.Len(t, result.Data, 2)
+	assert.Equal(t, "# Page 1", result.Data[0].Markdown)
+	assert.Equal(t, "# Page 2", result.Data[1].Markdown)
+}
+
+func TestCheckCrawlStatus_MaxPages_StopsAfterLimit(t *testing.T) {
+	requestCount := 0
+	var serverURL string
+	app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		requestCount++
+		// Every response advertises another page, so only MaxPages stops us.
+		next := serverURL + "/v2/crawl/" + validCrawlID + "?cursor=" + fmt.Sprintf("%d", requestCount+1)
+		respondJSON(w, http.StatusOK, CrawlStatusResponse{
+			Status:    "completed",
+			Total:     10,
+			Completed: 10,
+			Data:      []*FirecrawlDocument{{Markdown: fmt.Sprintf("# Page %d", requestCount)}},
+			Next:      &next,
+		})
+	})
+	serverURL = srv.URL
+
+	cfg := &PaginationConfig{
+		AutoPaginate: ptr(true),
+		MaxPages:     ptr(2), // Stop after 2 pages total.
+	}
+	result, err := app.CheckCrawlStatus(context.Background(), validCrawlID, cfg)
+	require.NoError(t, err)
+	// Only fetched page 1 (initial) + page 2 stopped by MaxPages limit.
+	assert.Equal(t, 2, requestCount)
+	assert.Len(t, result.Data, 2)
+}
+
+func TestCheckCrawlStatus_MaxResults_TruncatesExcess(t *testing.T) {
+	requestCount := 0
+	var serverURL string
+	app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		requestCount++
+		next := serverURL + "/v2/crawl/" + validCrawlID + "?cursor=2"
+		respondJSON(w, http.StatusOK, CrawlStatusResponse{
+			Status:    "completed",
+			Total:     6,
+			Completed: 6,
+			Data: []*FirecrawlDocument{
+				{Markdown: "# Doc A"},
+				{Markdown: "# Doc B"},
+				{Markdown: "# Doc C"},
+			},
+			Next: &next,
+		})
+	})
+	serverURL = srv.URL
+
+	cfg := &PaginationConfig{
+		AutoPaginate: ptr(true),
+		MaxResults:   ptr(3), // Stop after collecting 3 results total.
+	}
+	result, err := app.CheckCrawlStatus(context.Background(), validCrawlID, cfg)
+	require.NoError(t, err)
+	// First page gives 3 docs which meets MaxResults — no second request made.
+	assert.Equal(t, 1, requestCount)
+	assert.Len(t, result.Data, 3)
+}
+
+func TestCheckCrawlStatus_AutoPaginate_UnsafeNextURL(t *testing.T) {
+	// SSRF guard: a Next URL pointing at a different host must not be followed.
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		next := "https://attacker.example.com/steal?cursor=2"
+		respondJSON(w, http.StatusOK, CrawlStatusResponse{
+			Status:    "completed",
+			Total:     2,
+			Completed: 2,
+			Data:      []*FirecrawlDocument{{Markdown: "# Page 1"}},
+			Next:      &next,
+		})
+	})
+
+	cfg := &PaginationConfig{AutoPaginate: ptr(true)}
+	_, err := app.CheckCrawlStatus(context.Background(), validCrawlID, cfg)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "unsafe pagination URL")
+}
+
+// ---- GetCrawlStatusPage ----
+
+func TestGetCrawlStatusPage_Success(t *testing.T) {
+	app, srv := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		assert.Equal(t, http.MethodGet, r.Method)
+		respondJSON(w, http.StatusOK, CrawlStatusResponse{
+			Status:    "completed",
+			Total:     5,
+			Completed: 5,
+			Data:      []*FirecrawlDocument{{Markdown: "# Page 2"}},
+		})
+	})
+
+	nextURL := srv.URL + 
"/v2/crawl/" + validCrawlID + "?cursor=2" + result, err := app.GetCrawlStatusPage(context.Background(), nextURL) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 1) + assert.Equal(t, "# Page 2", result.Data[0].Markdown) +} + +func TestGetCrawlStatusPage_InvalidURL_SSRFBlocked(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made to untrusted host") + }) + + _, err := app.GetCrawlStatusPage(context.Background(), "https://attacker.example.com/steal") + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsafe pagination URL") +} diff --git a/errors.go b/errors.go new file mode 100644 index 0000000..3f82d2c --- /dev/null +++ b/errors.go @@ -0,0 +1,118 @@ +package firecrawl + +import ( + "encoding/json" + "errors" + "fmt" +) + +// Sentinel errors for programmatic error handling via errors.Is(). +var ( + // ErrNoAPIKey is returned when no API key is provided to the constructor. + ErrNoAPIKey = errors.New("no API key provided") + + // ErrUnauthorized is returned for HTTP 401 responses. + ErrUnauthorized = errors.New("unauthorized") + + // ErrPaymentRequired is returned for HTTP 402 responses. + ErrPaymentRequired = errors.New("payment required") + + // ErrNotFound is returned for HTTP 404 responses. + ErrNotFound = errors.New("not found") + + // ErrTimeout is returned for HTTP 408 responses. + ErrTimeout = errors.New("request timeout") + + // ErrConflict is returned for HTTP 409 responses. + ErrConflict = errors.New("conflict") + + // ErrRateLimited is returned for HTTP 429 responses. + ErrRateLimited = errors.New("rate limit exceeded") + + // ErrServerError is returned for HTTP 500 responses. + ErrServerError = errors.New("internal server error") +) + +// APIError represents a structured error from the Firecrawl API. 
+// It wraps a sentinel error based on the HTTP status code, enabling
+// programmatic error handling via errors.Is() and errors.As().
+//
+// Example usage:
+//
+//	_, err := app.ScrapeURL(ctx, url, nil)
+//	if errors.Is(err, firecrawl.ErrRateLimited) {
+//		// back off and retry
+//	}
+//
+//	var apiErr *firecrawl.APIError
+//	if errors.As(err, &apiErr) {
+//		log.Printf("API error %d during %s: %s", apiErr.StatusCode, apiErr.Action, apiErr.Message)
+//	}
+type APIError struct {
+	// StatusCode is the HTTP status code from the API response.
+	StatusCode int
+	// Message is the error message from the API response body.
+	Message string
+	// Action is the SDK operation that triggered the error (e.g., "scrape URL", "start crawl job").
+	Action string
+}
+
+// Error returns a human-readable error string.
+func (e *APIError) Error() string {
+	return fmt.Sprintf("API error %d during %s: %s", e.StatusCode, e.Action, e.Message)
+}
+
+// Unwrap returns the sentinel error corresponding to the HTTP status code.
+// This enables errors.Is(err, firecrawl.ErrRateLimited) and similar checks.
+func (e *APIError) Unwrap() error {
+	switch e.StatusCode {
+	case 401:
+		return ErrUnauthorized
+	case 402:
+		return ErrPaymentRequired
+	case 404:
+		return ErrNotFound
+	case 408:
+		return ErrTimeout
+	case 409:
+		return ErrConflict
+	case 429:
+		return ErrRateLimited
+	case 500:
+		// NOTE(review): only exactly 500 maps to ErrServerError; 502/503 and
+		// other unlisted codes fall through to nil — confirm this is intended.
+		return ErrServerError
+	default:
+		return nil
+	}
+}
+
+// handleError constructs an *APIError from an HTTP status code and response body.
+//
+// Parameters:
+//   - statusCode: The HTTP status code from the response.
+//   - body: The raw response body bytes.
+//   - action: A string describing the SDK operation being performed.
+//
+// Returns:
+//   - error: An *APIError wrapping the appropriate sentinel for the status code.
+func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error {
+	var errorData map[string]any
+	err := json.Unmarshal(body, &errorData)
+	if err != nil {
+		// Body was not valid JSON (e.g., an HTML error page from a proxy).
+		// NOTE(review): the raw body is discarded here; including a short
+		// snippet of it in Message could aid debugging — confirm desired.
+		return &APIError{
+			StatusCode: statusCode,
+			Message:    fmt.Sprintf("failed to parse error response: %v", err),
+			Action:     action,
+		}
+	}
+
+	errorMessage, _ := errorData["error"].(string)
+	if errorMessage == "" {
+		errorMessage = "No additional error details provided."
+	}
+
+	return &APIError{
+		StatusCode: statusCode,
+		Message:    errorMessage,
+		Action:     action,
+	}
+}
diff --git a/errors_test.go b/errors_test.go
new file mode 100644
index 0000000..1b0f8dc
--- /dev/null
+++ b/errors_test.go
@@ -0,0 +1,69 @@
+package firecrawl
+
+import (
+	"errors"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestHandleError_StatusCodes(t *testing.T) {
+	// Table test: each mapped status code must wrap its sentinel.
+	tests := []struct {
+		name         string
+		statusCode   int
+		body         string
+		wantSentinel error
+	}{
+		{"401 Unauthorized", 401, `{"error": "Invalid token"}`, ErrUnauthorized},
+		{"402 Payment Required", 402, `{"error": "Insufficient credits"}`, ErrPaymentRequired},
+		{"404 Not Found", 404, `{"error": "Resource not found"}`, ErrNotFound},
+		{"408 Timeout", 408, `{"error": "Timed out"}`, ErrTimeout},
+		{"409 Conflict", 409, `{"error": "Duplicate request"}`, ErrConflict},
+		{"429 Rate Limited", 429, `{"error": "Too many requests"}`, ErrRateLimited},
+		{"500 Server Error", 500, `{"error": "Internal error"}`, ErrServerError},
+	}
+
+	app := &FirecrawlApp{}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := app.handleError(tt.statusCode, []byte(tt.body), "test action")
+			assert.Error(t, err)
+			assert.True(t, errors.Is(err, tt.wantSentinel), "expected errors.Is to match %v", tt.wantSentinel)
+
+			var apiErr *APIError
+			assert.True(t, errors.As(err, &apiErr))
+			assert.Equal(t, tt.statusCode, apiErr.StatusCode)
+			assert.Equal(t, "test action", apiErr.Action)
+		})
+	}
+}
+
+func TestHandleError_InvalidJSON(t *testing.T) {
+	app 
:= &FirecrawlApp{} + err := app.handleError(500, []byte("not json"), "test action") + assert.Error(t, err) + + var apiErr *APIError + assert.True(t, errors.As(err, &apiErr)) + assert.Equal(t, 500, apiErr.StatusCode) + assert.Contains(t, apiErr.Message, "failed to parse") +} + +func TestHandleError_UnknownStatusCode(t *testing.T) { + app := &FirecrawlApp{} + err := app.handleError(418, []byte(`{"error": "I am a teapot"}`), "brew coffee") + assert.Error(t, err) + + var apiErr *APIError + assert.True(t, errors.As(err, &apiErr)) + assert.Equal(t, 418, apiErr.StatusCode) + // Unknown status should have nil Unwrap (no sentinel) + assert.Nil(t, apiErr.Unwrap()) +} + +func TestAPIError_ErrorMessage(t *testing.T) { + err := &APIError{StatusCode: 401, Message: "Invalid token", Action: "scrape URL"} + assert.Contains(t, err.Error(), "scrape URL") + assert.Contains(t, err.Error(), "401") + assert.Contains(t, err.Error(), "Invalid token") +} diff --git a/extract.go b/extract.go new file mode 100644 index 0000000..a8517db --- /dev/null +++ b/extract.go @@ -0,0 +1,188 @@ +package firecrawl + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "time" +) + +// extractRequest is the internal request struct for extract operations. +// It is unexported — callers use ExtractParams instead. +type extractRequest struct { + URLs []string `json:"urls"` + Prompt *string `json:"prompt,omitempty"` + Schema map[string]any `json:"schema,omitempty"` + EnableWebSearch *bool `json:"enableWebSearch,omitempty"` + IgnoreSitemap *bool `json:"ignoreSitemap,omitempty"` + IncludeSubdomains *bool `json:"includeSubdomains,omitempty"` + ShowSources *bool `json:"showSources,omitempty"` + IgnoreInvalidURLs *bool `json:"ignoreInvalidURLs,omitempty"` + ScrapeOptions *ScrapeParams `json:"scrapeOptions,omitempty"` +} + +// Extract performs LLM-based structured data extraction and polls until completion. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. 
+//   - urls: The list of URLs to extract data from.
+//   - params: Optional parameters for the extraction request.
+//
+// Returns:
+//   - *ExtractStatusResponse: The extraction result with structured data.
+//   - error: An error if the extraction fails.
+func (app *FirecrawlApp) Extract(ctx context.Context, urls []string, params *ExtractParams) (*ExtractStatusResponse, error) {
+	// Start the job, then poll /v2/extract/{id} until a terminal status.
+	response, err := app.AsyncExtract(ctx, urls, params)
+	if err != nil {
+		return nil, err
+	}
+
+	headers := app.prepareHeaders(nil)
+	return app.monitorExtractStatus(ctx, response.ID, headers)
+}
+
+// AsyncExtract starts an extraction job asynchronously.
+//
+// Parameters:
+//   - ctx: Context for cancellation and deadlines.
+//   - urls: The list of URLs to extract data from.
+//   - params: Optional parameters for the extraction request.
+//
+// Returns:
+//   - *ExtractResponse: The response with job ID for polling.
+//   - error: An error if starting the extraction fails.
+func (app *FirecrawlApp) AsyncExtract(ctx context.Context, urls []string, params *ExtractParams) (*ExtractResponse, error) {
+	// NOTE(review): urls is not validated for emptiness here; presumably the
+	// API rejects an empty list — confirm whether a client-side check is wanted.
+	headers := app.prepareHeaders(nil)
+
+	// Copy optional params into the wire struct; nil fields are omitted by
+	// the omitempty tags on extractRequest.
+	req := extractRequest{URLs: urls}
+	if params != nil {
+		req.Prompt = params.Prompt
+		req.Schema = params.Schema
+		req.EnableWebSearch = params.EnableWebSearch
+		req.IgnoreSitemap = params.IgnoreSitemap
+		req.IncludeSubdomains = params.IncludeSubdomains
+		req.ShowSources = params.ShowSources
+		req.IgnoreInvalidURLs = params.IgnoreInvalidURLs
+		req.ScrapeOptions = params.ScrapeOptions
+	}
+
+	body, err := json.Marshal(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal extract request: %w", err)
+	}
+
+	resp, err := app.makeRequest(
+		ctx,
+		http.MethodPost,
+		fmt.Sprintf("%s/v2/extract", app.APIURL),
+		body,
+		headers,
+		"start extract job",
+		withRetries(3),
+		withBackoff(500),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	var extractResponse ExtractResponse
+	if err := json.Unmarshal(resp, &extractResponse); err != nil {
+		return nil, fmt.Errorf("failed to parse extract response: %w", err)
+	}
+
+	if extractResponse.ID == "" {
+		return nil, fmt.Errorf("failed to get extract job ID")
+	}
+
+	return &extractResponse, nil
+}
+
+// CheckExtractStatus checks the status of an extraction job.
+//
+// Parameters:
+//   - ctx: Context for cancellation and deadlines.
+//   - id: The ID of the extraction job to check.
+//
+// Returns:
+//   - *ExtractStatusResponse: The current status of the extraction job.
+//   - error: An error if the status check fails.
+func (app *FirecrawlApp) CheckExtractStatus(ctx context.Context, id string) (*ExtractStatusResponse, error) {
+	// Reject malformed IDs before interpolating them into the URL path.
+	if err := validateJobID(id); err != nil {
+		return nil, err
+	}
+
+	headers := app.prepareHeaders(nil)
+
+	resp, err := app.makeRequest(
+		ctx,
+		http.MethodGet,
+		fmt.Sprintf("%s/v2/extract/%s", app.APIURL, id),
+		nil,
+		headers,
+		"check extract status",
+		withRetries(3),
+		withBackoff(500),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	var statusResponse ExtractStatusResponse
+	if err := json.Unmarshal(resp, &statusResponse); err != nil {
+		return nil, fmt.Errorf("failed to parse extract status response: %w", err)
+	}
+
+	return &statusResponse, nil
+}
+
+// monitorExtractStatus polls an extraction job until completion.
+// Unlike crawl/batch, extract uses "processing" status and has no pagination.
+func (app *FirecrawlApp) monitorExtractStatus(ctx context.Context, id string, headers map[string]string) (*ExtractStatusResponse, error) { + pollInterval := 2 + + for { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + resp, err := app.makeRequest( + ctx, + http.MethodGet, + fmt.Sprintf("%s/v2/extract/%s", app.APIURL, id), + nil, + headers, + "check extract status", + withRetries(3), + withBackoff(500), + ) + if err != nil { + return nil, err + } + + var statusData ExtractStatusResponse + if err := json.Unmarshal(resp, &statusData); err != nil { + return nil, err + } + + status := statusData.Status + if status == "" { + return nil, fmt.Errorf("invalid status in extract response") + } + + switch status { + case "completed": + return &statusData, nil + case "processing": + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(time.Duration(pollInterval) * time.Second): + } + case "failed": + return nil, fmt.Errorf("extract job failed. Status: %s", status) + default: + return nil, fmt.Errorf("unknown extract status: %s", status) + } + } +} diff --git a/extract_test.go b/extract_test.go new file mode 100644 index 0000000..90a5c07 --- /dev/null +++ b/extract_test.go @@ -0,0 +1,313 @@ +package firecrawl + +import ( + "context" + "net/http" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// validExtractID is a valid UUID used across extract tests. 
+const validExtractID = "660e8400-e29b-41d4-a716-446655440002"
+
+// ---- AsyncExtract ----
+
+func TestAsyncExtract_Success(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		assert.Equal(t, http.MethodPost, r.Method)
+		assert.Equal(t, "/v2/extract", r.URL.Path)
+
+		var body map[string]any
+		decodeJSONBody(t, r, &body)
+		// JSON arrays decode as []any, hence the type assertion.
+		urls, ok := body["urls"].([]any)
+		require.True(t, ok)
+		assert.Equal(t, "https://example.com", urls[0])
+
+		respondJSON(w, http.StatusOK, ExtractResponse{
+			Success: true,
+			ID:      validExtractID,
+		})
+	})
+
+	result, err := app.AsyncExtract(context.Background(), []string{"https://example.com"}, nil)
+	require.NoError(t, err)
+	assert.Equal(t, validExtractID, result.ID)
+	assert.True(t, result.Success)
+}
+
+func TestAsyncExtract_WithParams(t *testing.T) {
+	// Verifies every optional field is serialized into the request body.
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		var body map[string]any
+		decodeJSONBody(t, r, &body)
+
+		urls, ok := body["urls"].([]any)
+		require.True(t, ok)
+		assert.Len(t, urls, 2)
+
+		assert.NotNil(t, body["prompt"])
+		assert.NotNil(t, body["schema"])
+		assert.NotNil(t, body["enableWebSearch"])
+		assert.NotNil(t, body["ignoreSitemap"])
+		assert.NotNil(t, body["includeSubdomains"])
+		assert.NotNil(t, body["showSources"])
+		assert.NotNil(t, body["ignoreInvalidURLs"])
+		assert.NotNil(t, body["scrapeOptions"])
+
+		respondJSON(w, http.StatusOK, ExtractResponse{
+			Success: true,
+			ID:      validExtractID,
+		})
+	})
+
+	schema := map[string]any{
+		"type": "object",
+		"properties": map[string]any{
+			"name": map[string]any{"type": "string"},
+		},
+	}
+	params := &ExtractParams{
+		Prompt:            ptr("Extract the company name"),
+		Schema:            schema,
+		EnableWebSearch:   ptr(true),
+		IgnoreSitemap:     ptr(false),
+		IncludeSubdomains: ptr(true),
+		ShowSources:       ptr(true),
+		IgnoreInvalidURLs: ptr(true),
+		ScrapeOptions: &ScrapeParams{
+			Formats: []string{"markdown"},
+		},
+	}
+
+	result, err := app.AsyncExtract(
+		context.Background(),
+		[]string{"https://example.com", "https://example.org"},
+		params,
+	)
+	require.NoError(t, err)
+	assert.Equal(t, validExtractID, result.ID)
+}
+
+func TestAsyncExtract_MissingID(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		respondJSON(w, http.StatusOK, ExtractResponse{
+			Success: true,
+			ID:      "", // Missing ID
+		})
+	})
+
+	_, err := app.AsyncExtract(context.Background(), []string{"https://example.com"}, nil)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "job ID")
+}
+
+func TestAsyncExtract_Unauthorized(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"})
+	})
+
+	_, err := app.AsyncExtract(context.Background(), []string{"https://example.com"}, nil)
+	assert.Error(t, err)
+	assert.ErrorIs(t, err, ErrUnauthorized)
+}
+
+// ---- Extract ----
+
+func TestExtract_PollsUntilComplete(t *testing.T) {
+	// atomic counter: the handler runs on the server goroutine while the test
+	// goroutine drives the client, so plain int increments would race.
+	var requestCount atomic.Int32
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		count := requestCount.Add(1)
+		if r.Method == http.MethodPost && r.URL.Path == "/v2/extract" {
+			respondJSON(w, http.StatusOK, ExtractResponse{
+				Success: true,
+				ID:      validExtractID,
+			})
+			return
+		}
+		// First GET returns "processing", subsequent returns "completed".
+		if count == 2 {
+			respondJSON(w, http.StatusOK, ExtractStatusResponse{
+				Status: "processing",
+			})
+			return
+		}
+		respondJSON(w, http.StatusOK, ExtractStatusResponse{
+			Status:      "completed",
+			Success:     true,
+			CreditsUsed: 2,
+			Data: map[string]any{
+				"name": "Acme Corp",
+			},
+		})
+	})
+
+	result, err := app.Extract(context.Background(), []string{"https://example.com"}, nil)
+	require.NoError(t, err)
+	assert.Equal(t, "completed", result.Status)
+	assert.NotNil(t, result.Data)
+	assert.Equal(t, "Acme Corp", result.Data["name"])
+}
+
+func TestExtract_ContextCancelled(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // Cancel immediately before any request
+
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		t.Fatal("request should not be made with cancelled context")
+	})
+
+	_, err := app.Extract(ctx, []string{"https://example.com"}, nil)
+	assert.Error(t, err)
+	assert.ErrorIs(t, err, context.Canceled)
+}
+
+func TestExtract_Failed(t *testing.T) {
+	// A job that reaches "failed" must end polling with an error.
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		if r.Method == http.MethodPost {
+			respondJSON(w, http.StatusOK, ExtractResponse{Success: true, ID: validExtractID})
+			return
+		}
+		respondJSON(w, http.StatusOK, ExtractStatusResponse{Status: "failed"})
+	})
+
+	_, err := app.Extract(context.Background(), []string{"https://example.com"}, nil)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "failed")
+}
+
+// ---- CheckExtractStatus ----
+
+func TestCheckExtractStatus_Success(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		assert.Equal(t, http.MethodGet, r.Method)
+		assert.Equal(t, "/v2/extract/"+validExtractID, r.URL.Path)
+
+		respondJSON(w, http.StatusOK, ExtractStatusResponse{
+			Success:     true,
+			Status:      "completed",
+			CreditsUsed: 5,
+			ExpiresAt:   "2026-04-15T00:00:00Z",
+			Data: map[string]any{
+				"company": "Acme Corp",
+				// float64: JSON numbers round-trip as float64 in map[string]any.
+				"founded": float64(1990),
+			},
+		})
+	})
+
+	result, err := app.CheckExtractStatus(context.Background(), validExtractID)
+	require.NoError(t, err)
+	assert.Equal(t, "completed", result.Status)
+	assert.True(t, result.Success)
+	assert.Equal(t, 5, result.CreditsUsed)
+	assert.Equal(t, "2026-04-15T00:00:00Z", result.ExpiresAt)
+	assert.Equal(t, "Acme Corp", result.Data["company"])
+}
+
+func TestCheckExtractStatus_Processing(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		respondJSON(w, http.StatusOK, ExtractStatusResponse{
+			Status: "processing",
+		})
+	})
+
+	result, err := app.CheckExtractStatus(context.Background(), validExtractID)
+	require.NoError(t, err)
+	assert.Equal(t, "processing", result.Status)
+}
+
+func TestCheckExtractStatus_InvalidID(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		t.Fatal("request should not be made for invalid ID")
+	})
+
+	_, err := app.CheckExtractStatus(context.Background(), "not-a-uuid")
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "UUID")
+}
+
+func TestCheckExtractStatus_PathTraversalID(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		t.Fatal("request should not be made for path traversal ID")
+	})
+
+	_, err := app.CheckExtractStatus(context.Background(), "../../etc/passwd")
+	assert.Error(t, err)
+}
+
+func TestCheckExtractStatus_Unauthorized(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"})
+	})
+
+	_, err := app.CheckExtractStatus(context.Background(), validExtractID)
+	assert.Error(t, err)
+	assert.ErrorIs(t, err, ErrUnauthorized)
+}
+
+// ---- monitorExtractStatus ----
+
+func TestMonitorExtractStatus_CompletedImmediately(t *testing.T) {
+	app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) {
+		assert.Equal(t, http.MethodGet, r.Method)
+		respondJSON(w, http.StatusOK, 
ExtractStatusResponse{ + Status: "completed", + Success: true, + Data: map[string]any{"result": "value"}, + }) + }) + + headers := app.prepareHeaders(nil) + result, err := app.monitorExtractStatus(context.Background(), validExtractID, headers) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Equal(t, "value", result.Data["result"]) +} + +func TestMonitorExtractStatus_Failed(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, ExtractStatusResponse{Status: "failed"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorExtractStatus(context.Background(), validExtractID, headers) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed") +} + +func TestMonitorExtractStatus_UnknownStatus(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, ExtractStatusResponse{Status: "pending"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorExtractStatus(context.Background(), validExtractID, headers) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unknown extract status") +} + +func TestMonitorExtractStatus_EmptyStatus(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, ExtractStatusResponse{Status: ""}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorExtractStatus(context.Background(), validExtractID, headers) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid status") +} + +func TestMonitorExtractStatus_ContextCancelledBeforeRequest(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorExtractStatus(ctx, validExtractID, 
headers) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} diff --git a/firecrawl.go b/firecrawl.go index 695dc40..77f5d87 100644 --- a/firecrawl.go +++ b/firecrawl.go @@ -1,837 +1,2 @@ // Package firecrawl provides a client for interacting with the Firecrawl API. package firecrawl - -import ( - "bytes" - "encoding/json" - "fmt" - "io" - "math" - "net/http" - "os" - "time" -) - -type StringOrStringSlice []string - -func (s *StringOrStringSlice) UnmarshalJSON(data []byte) error { - var single string - if err := json.Unmarshal(data, &single); err == nil { - *s = []string{single} - return nil - } - - var list []string - if err := json.Unmarshal(data, &list); err == nil { - *s = list - return nil - } - - return fmt.Errorf("field is neither a string nor a list of strings") -} - -// FirecrawlDocumentMetadata represents metadata for a Firecrawl document -type FirecrawlDocumentMetadata struct { - Title *string `json:"title,omitempty"` - Description *StringOrStringSlice `json:"description,omitempty"` - Language *StringOrStringSlice `json:"language,omitempty"` - Keywords *StringOrStringSlice `json:"keywords,omitempty"` - Robots *StringOrStringSlice `json:"robots,omitempty"` - OGTitle *StringOrStringSlice `json:"ogTitle,omitempty"` - OGDescription *StringOrStringSlice `json:"ogDescription,omitempty"` - OGURL *StringOrStringSlice `json:"ogUrl,omitempty"` - OGImage *StringOrStringSlice `json:"ogImage,omitempty"` - OGAudio *StringOrStringSlice `json:"ogAudio,omitempty"` - OGDeterminer *StringOrStringSlice `json:"ogDeterminer,omitempty"` - OGLocale *StringOrStringSlice `json:"ogLocale,omitempty"` - OGLocaleAlternate []*string `json:"ogLocaleAlternate,omitempty"` - OGSiteName *StringOrStringSlice `json:"ogSiteName,omitempty"` - OGVideo *StringOrStringSlice `json:"ogVideo,omitempty"` - DCTermsCreated *StringOrStringSlice `json:"dctermsCreated,omitempty"` - DCDateCreated *StringOrStringSlice `json:"dcDateCreated,omitempty"` - DCDate *StringOrStringSlice 
`json:"dcDate,omitempty"` - DCTermsType *StringOrStringSlice `json:"dctermsType,omitempty"` - DCType *StringOrStringSlice `json:"dcType,omitempty"` - DCTermsAudience *StringOrStringSlice `json:"dctermsAudience,omitempty"` - DCTermsSubject *StringOrStringSlice `json:"dctermsSubject,omitempty"` - DCSubject *StringOrStringSlice `json:"dcSubject,omitempty"` - DCDescription *StringOrStringSlice `json:"dcDescription,omitempty"` - DCTermsKeywords *StringOrStringSlice `json:"dctermsKeywords,omitempty"` - ModifiedTime *StringOrStringSlice `json:"modifiedTime,omitempty"` - PublishedTime *StringOrStringSlice `json:"publishedTime,omitempty"` - ArticleTag *StringOrStringSlice `json:"articleTag,omitempty"` - ArticleSection *StringOrStringSlice `json:"articleSection,omitempty"` - URL *string `json:"url,omitempty"` - ScrapeID *string `json:"scrapeId,omitempty"` - SourceURL *string `json:"sourceURL,omitempty"` - StatusCode *int `json:"statusCode,omitempty"` - Error *string `json:"error,omitempty"` -} - -// JsonOptions represents the options for JSON extraction -type JsonOptions struct { - Schema map[string]any `json:"schema,omitempty"` - SystemPrompt *string `json:"systemPrompt,omitempty"` - Prompt *string `json:"prompt,omitempty"` -} - -// FirecrawlDocument represents a document in Firecrawl -type FirecrawlDocument struct { - Markdown string `json:"markdown,omitempty"` - HTML string `json:"html,omitempty"` - RawHTML string `json:"rawHtml,omitempty"` - Screenshot string `json:"screenshot,omitempty"` - JSON map[string]any `json:"json,omitempty"` - Links []string `json:"links,omitempty"` - Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"` -} - -// ScrapeParams represents the parameters for a scrape request. 
-type ScrapeParams struct { - Formats []string `json:"formats,omitempty"` - Headers *map[string]string `json:"headers,omitempty"` - IncludeTags []string `json:"includeTags,omitempty"` - ExcludeTags []string `json:"excludeTags,omitempty"` - OnlyMainContent *bool `json:"onlyMainContent,omitempty"` - WaitFor *int `json:"waitFor,omitempty"` - ParsePDF *bool `json:"parsePDF,omitempty"` - Timeout *int `json:"timeout,omitempty"` - MaxAge *int `json:"maxAge,omitempty"` - JsonOptions *JsonOptions `json:"jsonOptions,omitempty"` -} - -// ScrapeResponse represents the response for scraping operations -type ScrapeResponse struct { - Success bool `json:"success"` - Data *FirecrawlDocument `json:"data,omitempty"` -} - -// CrawlParams represents the parameters for a crawl request. -type CrawlParams struct { - ScrapeOptions ScrapeParams `json:"scrapeOptions"` - Webhook *string `json:"webhook,omitempty"` - Limit *int `json:"limit,omitempty"` - IncludePaths []string `json:"includePaths,omitempty"` - ExcludePaths []string `json:"excludePaths,omitempty"` - MaxDepth *int `json:"maxDepth,omitempty"` - AllowBackwardLinks *bool `json:"allowBackwardLinks,omitempty"` - AllowExternalLinks *bool `json:"allowExternalLinks,omitempty"` - IgnoreSitemap *bool `json:"ignoreSitemap,omitempty"` - IgnoreQueryParameters *bool `json:"ignoreQueryParameters,omitempty"` -} - -// CrawlResponse represents the response for crawling operations -type CrawlResponse struct { - Success bool `json:"success"` - ID string `json:"id,omitempty"` - URL string `json:"url,omitempty"` -} - -// CrawlStatusResponse (old JobStatusResponse) represents the response for checking crawl job -type CrawlStatusResponse struct { - Status string `json:"status"` - Total int `json:"total,omitempty"` - Completed int `json:"completed,omitempty"` - CreditsUsed int `json:"creditsUsed,omitempty"` - ExpiresAt string `json:"expiresAt,omitempty"` - Next *string `json:"next,omitempty"` - Data []*FirecrawlDocument `json:"data,omitempty"` -} - -// 
CancelCrawlJobResponse represents the response for canceling a crawl job -type CancelCrawlJobResponse struct { - Success bool `json:"success"` - Status string `json:"status"` -} - -// MapParams represents the parameters for a map request. -type MapParams struct { - IncludeSubdomains *bool `json:"includeSubdomains,omitempty"` - Search *string `json:"search,omitempty"` - IgnoreSitemap *bool `json:"ignoreSitemap,omitempty"` - Limit *int `json:"limit,omitempty"` -} - -// MapResponse represents the response for mapping operations -type MapResponse struct { - Success bool `json:"success"` - Links []string `json:"links,omitempty"` - Error string `json:"error,omitempty"` -} - -// requestOptions represents options for making requests. -type requestOptions struct { - retries int - backoff int -} - -// requestOption is a functional option type for requestOptions. -type requestOption func(*requestOptions) - -// newRequestOptions creates a new requestOptions instance with the provided options. -// -// Parameters: -// - opts: Optional request options. -// -// Returns: -// - *requestOptions: A new instance of requestOptions with the provided options. -func newRequestOptions(opts ...requestOption) *requestOptions { - options := &requestOptions{retries: 1} - for _, opt := range opts { - opt(options) - } - return options -} - -// withRetries sets the number of retries for a request. -// -// Parameters: -// - retries: The number of retries to be performed. -// -// Returns: -// - requestOption: A functional option that sets the number of retries for a request. -func withRetries(retries int) requestOption { - return func(opts *requestOptions) { - opts.retries = retries - } -} - -// withBackoff sets the backoff interval for a request. -// -// Parameters: -// - backoff: The backoff interval (in milliseconds) to be used for retries. -// -// Returns: -// - requestOption: A functional option that sets the backoff interval for a request. 
-func withBackoff(backoff int) requestOption { - return func(opts *requestOptions) { - opts.backoff = backoff - } -} - -// FirecrawlApp represents a client for the Firecrawl API. -type FirecrawlApp struct { - APIKey string - APIURL string - Client *http.Client - Version string -} - -// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL. -// If the API key or API URL is not provided, it attempts to retrieve them from environment variables. -// If the API key is still not found, it returns an error. -// -// Parameters: -// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable. -// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev". -// - timeout: The timeout for the HTTP client. If not provided, it will default to 60 seconds. -// -// Returns: -// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL. -// - error: An error if the API key is not provided or retrieved. -func NewFirecrawlApp(apiKey, apiURL string, timeout ...time.Duration) (*FirecrawlApp, error) { - if apiKey == "" { - apiKey = os.Getenv("FIRECRAWL_API_KEY") - if apiKey == "" { - return nil, fmt.Errorf("no API key provided") - } - } - - if apiURL == "" { - apiURL = os.Getenv("FIRECRAWL_API_URL") - if apiURL == "" { - apiURL = "https://api.firecrawl.dev" - } - } - - t := 120 * time.Second // default - if len(timeout) > 0 { - t = timeout[0] - } - - client := &http.Client{ - Timeout: t, - Transport: http.DefaultTransport, - } - - return &FirecrawlApp{ - APIKey: apiKey, - APIURL: apiURL, - Client: client, - }, nil -} - -// ScrapeURL scrapes the content of the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to be scraped. 
-// - params: Optional parameters for the scrape request, including extractor options for LLM extraction. -// -// Returns: -// - *FirecrawlDocument or *FirecrawlDocumentV0: The scraped document data depending on the API version. -// - error: An error if the scrape request fails. -func (app *FirecrawlApp) ScrapeURL(url string, params *ScrapeParams) (*FirecrawlDocument, error) { - headers := app.prepareHeaders(nil) - scrapeBody := map[string]any{"url": url} - - // if params != nil { - // if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok { - // if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok { - // extractorOptions.ExtractionSchema = schema.schema() - // } - // if extractorOptions.Mode == "" { - // extractorOptions.Mode = "llm-extraction" - // } - // scrapeBody["extractorOptions"] = extractorOptions - // } - - // for key, value := range params { - // if key != "extractorOptions" { - // scrapeBody[key] = value - // } - // } - // } - - if params != nil { - if params.Formats != nil { - scrapeBody["formats"] = params.Formats - } - if params.Headers != nil { - scrapeBody["headers"] = params.Headers - } - if params.IncludeTags != nil { - scrapeBody["includeTags"] = params.IncludeTags - } - if params.ExcludeTags != nil { - scrapeBody["excludeTags"] = params.ExcludeTags - } - if params.OnlyMainContent != nil { - scrapeBody["onlyMainContent"] = params.OnlyMainContent - } - if params.WaitFor != nil { - scrapeBody["waitFor"] = params.WaitFor - } - if params.ParsePDF != nil { - scrapeBody["parsePDF"] = params.ParsePDF - } - if params.Timeout != nil { - scrapeBody["timeout"] = params.Timeout - } - if params.MaxAge != nil { - scrapeBody["maxAge"] = params.MaxAge - } - if params.JsonOptions != nil { - scrapeBody["jsonOptions"] = params.JsonOptions - } - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v1/scrape", app.APIURL), - scrapeBody, - headers, - "scrape URL", - ) - if err != nil { - return 
nil, err - } - - var scrapeResponse ScrapeResponse - err = json.Unmarshal(resp, &scrapeResponse) - - if scrapeResponse.Success { - return scrapeResponse.Data, nil - } - - if err != nil { - return nil, err - } - - return nil, fmt.Errorf("failed to scrape URL") -} - -// CrawlURL starts a crawl job for the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to crawl. -// - params: Optional parameters for the crawl request. -// - idempotencyKey: An optional idempotency key to ensure the request is idempotent (can be nil). -// - pollInterval: An optional interval (in seconds) at which to poll the job status. Default is 2 seconds. -// -// Returns: -// - CrawlStatusResponse: The crawl result if the job is completed. -// - error: An error if the crawl request fails. -func (app *FirecrawlApp) CrawlURL(url string, params *CrawlParams, idempotencyKey *string, pollInterval ...int) (*CrawlStatusResponse, error) { - var key string - if idempotencyKey != nil { - key = *idempotencyKey - } - - headers := app.prepareHeaders(&key) - crawlBody := map[string]any{"url": url} - - if params != nil { - if params.ScrapeOptions.Formats != nil { - crawlBody["scrapeOptions"] = params.ScrapeOptions - } - if params.Webhook != nil { - crawlBody["webhook"] = params.Webhook - } - if params.Limit != nil { - crawlBody["limit"] = params.Limit - } - if params.IncludePaths != nil { - crawlBody["includePaths"] = params.IncludePaths - } - if params.ExcludePaths != nil { - crawlBody["excludePaths"] = params.ExcludePaths - } - if params.MaxDepth != nil { - crawlBody["maxDepth"] = params.MaxDepth - } - if params.AllowBackwardLinks != nil { - crawlBody["allowBackwardLinks"] = params.AllowBackwardLinks - } - if params.AllowExternalLinks != nil { - crawlBody["allowExternalLinks"] = params.AllowExternalLinks - } - if params.IgnoreSitemap != nil { - crawlBody["ignoreSitemap"] = params.IgnoreSitemap - } - if params.IgnoreQueryParameters != nil { - crawlBody["ignoreQueryParameters"] = 
params.IgnoreQueryParameters - } - } - - actualPollInterval := 2 - if len(pollInterval) > 0 { - actualPollInterval = pollInterval[0] - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v1/crawl", app.APIURL), - crawlBody, - headers, - "start crawl job", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var crawlResponse CrawlResponse - err = json.Unmarshal(resp, &crawlResponse) - if err != nil { - return nil, err - } - - return app.monitorJobStatus(crawlResponse.ID, headers, actualPollInterval) -} - -// CrawlURL starts a crawl job for the specified URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to crawl. -// - params: Optional parameters for the crawl request. -// - idempotencyKey: An optional idempotency key to ensure the request is idempotent. -// -// Returns: -// - *CrawlResponse: The crawl response with id. -// - error: An error if the crawl request fails. -func (app *FirecrawlApp) AsyncCrawlURL(url string, params *CrawlParams, idempotencyKey *string) (*CrawlResponse, error) { - var key string - if idempotencyKey != nil { - key = *idempotencyKey - } - - headers := app.prepareHeaders(&key) - crawlBody := map[string]any{"url": url} - - if params != nil { - if params.ScrapeOptions.Formats != nil { - crawlBody["scrapeOptions"] = params.ScrapeOptions - } - if params.Webhook != nil { - crawlBody["webhook"] = params.Webhook - } - if params.Limit != nil { - crawlBody["limit"] = params.Limit - } - if params.IncludePaths != nil { - crawlBody["includePaths"] = params.IncludePaths - } - if params.ExcludePaths != nil { - crawlBody["excludePaths"] = params.ExcludePaths - } - if params.MaxDepth != nil { - crawlBody["maxDepth"] = params.MaxDepth - } - if params.AllowBackwardLinks != nil { - crawlBody["allowBackwardLinks"] = params.AllowBackwardLinks - } - if params.AllowExternalLinks != nil { - crawlBody["allowExternalLinks"] = params.AllowExternalLinks - } - if params.IgnoreSitemap != nil { - 
crawlBody["ignoreSitemap"] = params.IgnoreSitemap - } - if params.IgnoreQueryParameters != nil { - crawlBody["ignoreQueryParameters"] = params.IgnoreQueryParameters - } - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v1/crawl", app.APIURL), - crawlBody, - headers, - "start crawl job", - withRetries(3), - withBackoff(500), - ) - - if err != nil { - return nil, err - } - - var crawlResponse CrawlResponse - err = json.Unmarshal(resp, &crawlResponse) - if err != nil { - return nil, err - } - - if crawlResponse.ID == "" { - return nil, fmt.Errorf("failed to get job ID") - } - - return &crawlResponse, nil -} - -// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API. -// -// Parameters: -// - ID: The ID of the crawl job to check. -// -// Returns: -// - *CrawlStatusResponse: The status of the crawl job. -// - error: An error if the crawl status check request fails. -func (app *FirecrawlApp) CheckCrawlStatus(ID string) (*CrawlStatusResponse, error) { - headers := app.prepareHeaders(nil) - apiURL := fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID) - - resp, err := app.makeRequest( - http.MethodGet, - apiURL, - nil, - headers, - "check crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var jobStatusResponse CrawlStatusResponse - err = json.Unmarshal(resp, &jobStatusResponse) - if err != nil { - return nil, err - } - - return &jobStatusResponse, nil -} - -// CancelCrawlJob cancels a crawl job using the Firecrawl API. -// -// Parameters: -// - ID: The ID of the crawl job to cancel. -// -// Returns: -// - string: The status of the crawl job after cancellation. -// - error: An error if the crawl job cancellation request fails. 
-func (app *FirecrawlApp) CancelCrawlJob(ID string) (string, error) { - headers := app.prepareHeaders(nil) - apiURL := fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID) - resp, err := app.makeRequest( - http.MethodDelete, - apiURL, - nil, - headers, - "cancel crawl job", - ) - if err != nil { - return "", err - } - - var cancelCrawlJobResponse CancelCrawlJobResponse - err = json.Unmarshal(resp, &cancelCrawlJobResponse) - if err != nil { - return "", err - } - - return cancelCrawlJobResponse.Status, nil -} - -// MapURL initiates a mapping operation for a URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to map. -// - params: Optional parameters for the mapping request. -// -// Returns: -// - *MapResponse: The response from the mapping operation. -// - error: An error if the mapping request fails. -func (app *FirecrawlApp) MapURL(url string, params *MapParams) (*MapResponse, error) { - headers := app.prepareHeaders(nil) - jsonData := map[string]any{"url": url} - - if params != nil { - if params.IncludeSubdomains != nil { - jsonData["includeSubdomains"] = params.IncludeSubdomains - } - if params.Search != nil { - jsonData["search"] = params.Search - } - if params.IgnoreSitemap != nil { - jsonData["ignoreSitemap"] = params.IgnoreSitemap - } - if params.Limit != nil { - jsonData["limit"] = params.Limit - } - } - - resp, err := app.makeRequest( - http.MethodPost, - fmt.Sprintf("%s/v1/map", app.APIURL), - jsonData, - headers, - "map", - ) - if err != nil { - return nil, err - } - - var mapResponse MapResponse - err = json.Unmarshal(resp, &mapResponse) - if err != nil { - return nil, err - } - - if mapResponse.Success { - return &mapResponse, nil - } else { - return nil, fmt.Errorf("map operation failed: %s", mapResponse.Error) - } -} - -// SearchURL searches for a URL using the Firecrawl API. -// -// Parameters: -// - url: The URL to search for. -// - params: Optional parameters for the search request. -// - error: An error if the search request fails. 
-// -// Search is not implemented in API version 1.0.0. -func (app *FirecrawlApp) Search(query string, params *any) (any, error) { - return nil, fmt.Errorf("Search is not implemented in API version 1.0.0") -} - -// prepareHeaders prepares the headers for an HTTP request. -// -// Parameters: -// - idempotencyKey: A string representing the idempotency key to be included in the headers. -// If the idempotency key is an empty string, it will not be included in the headers. -// -// Returns: -// - map[string]string: A map containing the headers for the HTTP request. -func (app *FirecrawlApp) prepareHeaders(idempotencyKey *string) map[string]string { - headers := map[string]string{ - "Content-Type": "application/json", - "Authorization": fmt.Sprintf("Bearer %s", app.APIKey), - } - if idempotencyKey != nil { - headers["x-idempotency-key"] = *idempotencyKey - } - return headers -} - -// makeRequest makes a request to the specified URL with the provided method, data, headers, and options. -// -// Parameters: -// - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE"). -// - url: The URL to send the request to. -// - data: The data to be sent in the request body. -// - headers: The headers to be included in the request. -// - action: A string describing the action being performed. -// - opts: Optional request options. -// -// Returns: -// - []byte: The response body from the request. -// - error: An error if the request fails. 
-func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) { - var body []byte - var err error - if data != nil { - body, err = json.Marshal(data) - if err != nil { - return nil, err - } - } - - req, err := http.NewRequest(method, url, bytes.NewBuffer(body)) - if err != nil { - return nil, err - } - - for key, value := range headers { - req.Header.Set(key, value) - } - - var resp *http.Response - options := newRequestOptions(opts...) - for i := 0; i < options.retries; i++ { - resp, err = app.Client.Do(req) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - if resp.StatusCode != 502 { - break - } - - time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond) - } - - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return nil, err - } - - statusCode := resp.StatusCode - if statusCode != 200 { - return nil, app.handleError(statusCode, respBody, action) - } - - return respBody, nil -} - -// monitorJobStatus monitors the status of a crawl job using the Firecrawl API. -// -// Parameters: -// - ID: The ID of the crawl job to monitor. -// - headers: The headers to be included in the request. -// - pollInterval: The interval (in seconds) at which to poll the job status. -// -// Returns: -// - *CrawlStatusResponse: The crawl result if the job is completed. -// - error: An error if the crawl status check request fails. 
-func (app *FirecrawlApp) monitorJobStatus(ID string, headers map[string]string, pollInterval int) (*CrawlStatusResponse, error) { - attempts := 3 - - for { - resp, err := app.makeRequest( - http.MethodGet, - fmt.Sprintf("%s/v1/crawl/%s", app.APIURL, ID), - nil, - headers, - "check crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - var statusData CrawlStatusResponse - err = json.Unmarshal(resp, &statusData) - if err != nil { - return nil, err - } - - status := statusData.Status - if status == "" { - return nil, fmt.Errorf("invalid status in response") - } - if status == "completed" { - if statusData.Data != nil { - allData := statusData.Data - for statusData.Next != nil { - resp, err := app.makeRequest( - http.MethodGet, - *statusData.Next, - nil, - headers, - "fetch next page of crawl status", - withRetries(3), - withBackoff(500), - ) - if err != nil { - return nil, err - } - - err = json.Unmarshal(resp, &statusData) - if err != nil { - return nil, err - } - - if statusData.Data != nil { - allData = append(allData, statusData.Data...) - } - } - statusData.Data = allData - return &statusData, nil - } else { - attempts++ - if attempts > 3 { - return nil, fmt.Errorf("crawl job completed but no data was returned") - } - } - } else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" || status == "scraping" { - pollInterval = max(pollInterval, 2) - time.Sleep(time.Duration(pollInterval) * time.Second) - } else { - return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status) - } - } -} - -// handleError handles errors returned by the Firecrawl API. -// -// Parameters: -// - resp: The HTTP response object. -// - body: The response body from the HTTP response. -// - action: A string describing the action being performed. -// -// Returns: -// - error: An error describing the failure reason. 
-func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error { - var errorData map[string]any - err := json.Unmarshal(body, &errorData) - if err != nil { - return fmt.Errorf("failed to parse error response: %v", err) - } - - errorMessage, _ := errorData["error"].(string) - if errorMessage == "" { - errorMessage = "No additional error details provided." - } - - var message string - switch statusCode { - case 402: - message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage) - case 408: - message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage) - case 409: - message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage) - case 500: - message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage) - default: - message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage) - } - - return fmt.Errorf(message) -} diff --git a/firecrawl_test.go b/firecrawl_test.go index d012bf8..8f8485b 100644 --- a/firecrawl_test.go +++ b/firecrawl_test.go @@ -1,6 +1,9 @@ +//go:build integration + package firecrawl import ( + "context" "log" "os" "testing" @@ -15,17 +18,15 @@ import ( var API_URL string var TEST_API_KEY string -func ptr[T any](v T) *T { - return &v -} - -func init() { +func TestMain(m *testing.M) { err := godotenv.Load(".env") if err != nil { - log.Fatalf("Error loading .env file: %v", err) + log.Printf("Warning: could not load .env file: %v — skipping integration tests", err) + os.Exit(0) } API_URL = os.Getenv("API_URL") TEST_API_KEY = os.Getenv("TEST_API_KEY") + os.Exit(m.Run()) } func TestNoAPIKey(t *testing.T) { @@ -38,7 +39,7 @@ func TestScrapeURLInvalidAPIKey(t *testing.T) { app, err := NewFirecrawlApp("invalid_api_key", API_URL) require.NoError(t, err) - _, err = app.ScrapeURL("https://firecrawl.dev", nil) + _, err = app.ScrapeURL(context.Background(), 
"https://firecrawl.dev", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") } @@ -47,7 +48,7 @@ func TestBlocklistedURL(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) + _, err = app.ScrapeURL(context.Background(), "https://facebook.com/fake-test", nil) assert.Error(t, err) assert.Contains(t, err.Error(), "Status code 403") } @@ -56,7 +57,7 @@ func TestScrapeURLE2E(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.ScrapeURL("https://www.scrapethissite.com", nil) + response, err := app.ScrapeURL(context.Background(), "https://www.scrapethissite.com", nil) require.NoError(t, err) assert.NotNil(t, response) @@ -80,7 +81,7 @@ func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) { WaitFor: ptr(1000), } - response, err := app.ScrapeURL("https://www.scrapethissite.com", ¶ms) + response, err := app.ScrapeURL(context.Background(), "https://www.scrapethissite.com", ¶ms) require.NoError(t, err) assert.NotNil(t, response) @@ -98,7 +99,7 @@ func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil) + response, err := app.ScrapeURL(context.Background(), "https://arxiv.org/pdf/astro-ph/9301001.pdf", nil) require.NoError(t, err) assert.NotNil(t, response) @@ -110,7 +111,7 @@ func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t * app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil) + response, err := app.ScrapeURL(context.Background(), "https://arxiv.org/pdf/astro-ph/9301001", nil) require.NoError(t, err) 
assert.NotNil(t, response) @@ -122,7 +123,7 @@ func TestCrawlURLInvalidAPIKey(t *testing.T) { app, err := NewFirecrawlApp("invalid_api_key", API_URL) require.NoError(t, err) - _, err = app.CrawlURL("https://firecrawl.dev", nil, nil) + _, err = app.CrawlURL(context.Background(), "https://firecrawl.dev", nil, nil) assert.Error(t, err) assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token") } @@ -131,7 +132,7 @@ func TestShouldReturnErrorForBlocklistedURL(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - _, err = app.CrawlURL("https://twitter.com/fake-test", nil, nil) + _, err = app.CrawlURL(context.Background(), "https://twitter.com/fake-test", nil, nil) assert.Error(t, err) assert.Contains(t, err.Error(), "Status code 403") } @@ -140,7 +141,7 @@ func TestCrawlURLE2E(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.CrawlURL("https://www.scrapethissite.com", nil, nil) + response, err := app.CrawlURL(context.Background(), "https://www.scrapethissite.com", nil, nil) require.NoError(t, err) assert.NotNil(t, response) @@ -162,14 +163,14 @@ func TestCrawlURLWithOptionsE2E(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.CrawlURL("https://www.scrapethissite.com", + response, err := app.CrawlURL(context.Background(), "https://www.scrapethissite.com", &CrawlParams{ - ExcludePaths: []string{"blog/*"}, - IncludePaths: []string{"/"}, - MaxDepth: ptr(2), - IgnoreSitemap: ptr(true), - Limit: ptr(10), - AllowBackwardLinks: ptr(true), + ExcludePaths: []string{"blog/*"}, + IncludePaths: []string{"/"}, + MaxDiscoveryDepth: ptr(2), + Sitemap: ptr("skip"), + Limit: ptr(10), + CrawlEntireDomain: ptr(true), AllowExternalLinks: ptr(true), ScrapeOptions: ScrapeParams{ Formats: []string{"markdown", "html", "rawHtml", "screenshot", "links"}, @@ -219,7 +220,7 
@@ func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) { ExcludePaths: []string{"blog/*"}, Limit: ptr(10), } - response, err := app.CrawlURL("https://www.scrapethissite.com", params, &uniqueIdempotencyKey) + response, err := app.CrawlURL(context.Background(), "https://www.scrapethissite.com", params, &uniqueIdempotencyKey) require.NoError(t, err) assert.NotNil(t, response) @@ -228,7 +229,7 @@ func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) { require.IsType(t, []*FirecrawlDocument{}, data) assert.Contains(t, data[0].Markdown, "# Scrape This Site") - _, err = app.CrawlURL("https://firecrawl.dev", params, &uniqueIdempotencyKey) + _, err = app.CrawlURL(context.Background(), "https://firecrawl.dev", params, &uniqueIdempotencyKey) assert.Error(t, err) assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used") } @@ -237,7 +238,7 @@ func TestAsyncCrawlURLE2E(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.AsyncCrawlURL("https://www.scrapethissite.com", nil, nil) + response, err := app.AsyncCrawlURL(context.Background(), "https://www.scrapethissite.com", nil, nil) require.NoError(t, err) assert.NotNil(t, response) @@ -250,14 +251,14 @@ func TestAsyncCrawlURLWithOptionsE2E(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.AsyncCrawlURL("https://www.scrapethissite.com", + response, err := app.AsyncCrawlURL(context.Background(), "https://www.scrapethissite.com", &CrawlParams{ - ExcludePaths: []string{"blog/*"}, - IncludePaths: []string{"/"}, - MaxDepth: ptr(2), - IgnoreSitemap: ptr(true), - Limit: ptr(10), - AllowBackwardLinks: ptr(true), + ExcludePaths: []string{"blog/*"}, + IncludePaths: []string{"/"}, + MaxDiscoveryDepth: ptr(2), + Sitemap: ptr("skip"), + Limit: ptr(10), + CrawlEntireDomain: ptr(true), AllowExternalLinks: ptr(true), ScrapeOptions: ScrapeParams{ Formats: 
[]string{"markdown", "html", "rawHtml", "screenshot", "links"}, @@ -286,14 +287,14 @@ func TestAsyncCrawlURLWithIdempotencyKeyE2E(t *testing.T) { params := &CrawlParams{ ExcludePaths: []string{"blog/*"}, } - response, err := app.AsyncCrawlURL("https://www.scrapethissite.com", params, &uniqueIdempotencyKey) + response, err := app.AsyncCrawlURL(context.Background(), "https://www.scrapethissite.com", params, &uniqueIdempotencyKey) require.NoError(t, err) assert.NotNil(t, response) assert.NotNil(t, response.ID) assert.NotNil(t, response.URL) assert.True(t, response.Success) - _, err = app.AsyncCrawlURL("https://firecrawl.dev", params, &uniqueIdempotencyKey) + _, err = app.AsyncCrawlURL(context.Background(), "https://firecrawl.dev", params, &uniqueIdempotencyKey) assert.Error(t, err) assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used") } @@ -307,7 +308,7 @@ func TestCheckCrawlStatusE2E(t *testing.T) { Formats: []string{"markdown", "html", "rawHtml", "screenshot", "links"}, }, } - asyncCrawlResponse, err := app.AsyncCrawlURL("https://firecrawl.dev", params, nil) + asyncCrawlResponse, err := app.AsyncCrawlURL(context.Background(), "https://firecrawl.dev", params, nil) require.NoError(t, err) assert.NotNil(t, asyncCrawlResponse) @@ -321,15 +322,15 @@ func TestCheckCrawlStatusE2E(t *testing.T) { time.Sleep(5 * time.Second) // wait for 5 seconds - response, err := app.CheckCrawlStatus(asyncCrawlResponse.ID) - require.NoError(t, err) - assert.NotNil(t, response) + statusResponse, statusErr := app.CheckCrawlStatus(context.Background(), asyncCrawlResponse.ID) + require.NoError(t, statusErr) + assert.NotNil(t, statusResponse) - assert.GreaterOrEqual(t, len(response.Data), 0) - assert.GreaterOrEqual(t, response.Total, 0) - assert.GreaterOrEqual(t, response.CreditsUsed, 0) + assert.GreaterOrEqual(t, len(statusResponse.Data), 0) + assert.GreaterOrEqual(t, statusResponse.Total, 0) + assert.GreaterOrEqual(t, 
statusResponse.CreditsUsed, 0) - if response.Status == "completed" { + if statusResponse.Status == "completed" { break } @@ -337,7 +338,7 @@ func TestCheckCrawlStatusE2E(t *testing.T) { } // Final check after loop or if completed - response, err := app.CheckCrawlStatus(asyncCrawlResponse.ID) + response, err := app.CheckCrawlStatus(context.Background(), asyncCrawlResponse.ID) require.NoError(t, err) assert.NotNil(t, response) @@ -363,7 +364,7 @@ func TestCheckCrawlStatusE2E(t *testing.T) { func TestMapURLInvalidAPIKey(t *testing.T) { invalidApp, err := NewFirecrawlApp("invalid_api_key", API_URL) require.NoError(t, err) - _, err = invalidApp.MapURL("https://www.scrapethissite.com", nil) + _, err = invalidApp.MapURL(context.Background(), "https://www.scrapethissite.com", nil) require.Error(t, err) assert.Contains(t, err.Error(), "Status code 401") } @@ -372,7 +373,7 @@ func TestMapURLBlocklistedURL(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) blocklistedUrl := "https://facebook.com/fake-test" - _, err = app.MapURL(blocklistedUrl, nil) + _, err = app.MapURL(context.Background(), blocklistedUrl, nil) require.Error(t, err) assert.Contains(t, err.Error(), "Status code 403") } @@ -381,22 +382,26 @@ func TestMapURLValidMap(t *testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - response, err := app.MapURL("https://www.scrapethissite.com", nil) + response, err := app.MapURL(context.Background(), "https://www.scrapethissite.com", nil) require.NoError(t, err) assert.NotNil(t, response) assert.IsType(t, &MapResponse{}, response) assert.Greater(t, len(response.Links), 0) - assert.Contains(t, response.Links[0], "https://") - assert.Contains(t, response.Links[0], "scrapethissite.com") + assert.Contains(t, response.Links[0].URL, "https://") + assert.Contains(t, response.Links[0].URL, "scrapethissite.com") } -func TestMapURLWithSearchParameter(t *testing.T) { +func TestMapURLWithSearchParameterE2E(t 
*testing.T) { app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) require.NoError(t, err) - _, err = app.Search("https://www.scrapethissite.com", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Search is not implemented in API version 1.0.0") + response, err := app.MapURL(context.Background(), "https://www.scrapethissite.com", &MapParams{ + Search: ptr("hockey"), + Limit: ptr(5), + }) + require.NoError(t, err) + assert.NotNil(t, response) + assert.True(t, response.Success) } func TestScrapeURLWithMaxAge(t *testing.T) { @@ -409,7 +414,7 @@ func TestScrapeURLWithMaxAge(t *testing.T) { MaxAge: ptr(3600000), // 1 hour in milliseconds } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) + response, err := app.ScrapeURL(context.Background(), "https://roastmywebsite.ai", params) require.NoError(t, err) assert.NotNil(t, response) @@ -428,7 +433,7 @@ func TestScrapeURLWithMaxAgeZero(t *testing.T) { MaxAge: ptr(0), // Disable caching } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) + response, err := app.ScrapeURL(context.Background(), "https://roastmywebsite.ai", params) require.NoError(t, err) assert.NotNil(t, response) @@ -450,7 +455,7 @@ func TestCrawlURLWithMaxAge(t *testing.T) { Limit: ptr(5), // Limit to 5 pages for faster test } - response, err := app.CrawlURL("https://roastmywebsite.ai", params, nil) + response, err := app.CrawlURL(context.Background(), "https://roastmywebsite.ai", params, nil) require.NoError(t, err) assert.NotNil(t, response) @@ -495,7 +500,7 @@ func TestScrapeURLWithJsonOptions(t *testing.T) { }, } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) + response, err := app.ScrapeURL(context.Background(), "https://roastmywebsite.ai", params) require.NoError(t, err) assert.NotNil(t, response) @@ -535,7 +540,7 @@ func TestScrapeURLWithJSONOptions(t *testing.T) { }, } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) + response, err := 
app.ScrapeURL(context.Background(), "https://roastmywebsite.ai", params) require.NoError(t, err) assert.NotNil(t, response) // When using jsonOptions, the extracted data is in JSON field @@ -544,3 +549,150 @@ func TestScrapeURLWithJSONOptions(t *testing.T) { // Check that the extracted data contains the expected fields assert.Contains(t, response.JSON, "mission") } + +// --- Map E2E Tests --- + +func TestMapURLWithLinksE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + result, err := app.MapURL(context.Background(), "https://firecrawl.dev", &MapParams{ + Limit: ptr(5), + }) + require.NoError(t, err) + assert.True(t, result.Success) + assert.Greater(t, len(result.Links), 0) + assert.NotEmpty(t, result.Links[0].URL) +} + +// --- Search E2E Tests --- + +func TestSearchE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + result, err := app.Search(context.Background(), "firecrawl web scraping", nil) + require.NoError(t, err) + assert.True(t, result.Success) + assert.Greater(t, len(result.Data.Web), 0) + assert.NotEmpty(t, result.Data.Web[0].URL) + assert.NotEmpty(t, result.Data.Web[0].Title) +} + +func TestSearchWithParamsE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + result, err := app.Search(context.Background(), "firecrawl", &SearchParams{ + Limit: ptr(3), + Country: ptr("US"), + }) + require.NoError(t, err) + assert.True(t, result.Success) + assert.LessOrEqual(t, len(result.Data.Web), 3) +} + +func TestSearchWithScrapeOptionsE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + result, err := app.Search(context.Background(), "firecrawl", &SearchParams{ + Limit: ptr(2), + ScrapeOptions: &ScrapeParams{ + Formats: []string{"markdown"}, + }, + }) + require.NoError(t, err) + assert.True(t, result.Success) +} + +// --- Batch Scrape E2E Tests --- + +func 
TestAsyncBatchScrapeURLsE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + response, err := app.AsyncBatchScrapeURLs( + context.Background(), + []string{"https://firecrawl.dev"}, + nil, nil, + ) + require.NoError(t, err) + assert.True(t, response.Success) + assert.NotEmpty(t, response.ID) +} + +func TestCheckBatchScrapeStatusE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + // Start a batch job first. + response, err := app.AsyncBatchScrapeURLs( + context.Background(), + []string{"https://firecrawl.dev"}, + nil, nil, + ) + require.NoError(t, err) + + // Check status immediately — it may be scraping or completed. + status, err := app.CheckBatchScrapeStatus(context.Background(), response.ID) + require.NoError(t, err) + assert.NotEmpty(t, status.Status) +} + +// --- Extract E2E Tests --- + +func TestAsyncExtractE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + response, err := app.AsyncExtract( + context.Background(), + []string{"https://firecrawl.dev"}, + &ExtractParams{ + Prompt: ptr("Extract the company name"), + }, + ) + require.NoError(t, err) + assert.True(t, response.Success) + assert.NotEmpty(t, response.ID) +} + +func TestCheckExtractStatusE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + response, err := app.AsyncExtract( + context.Background(), + []string{"https://firecrawl.dev"}, + &ExtractParams{Prompt: ptr("Extract company name")}, + ) + require.NoError(t, err) + + status, err := app.CheckExtractStatus(context.Background(), response.ID) + require.NoError(t, err) + assert.NotEmpty(t, status.Status) +} + +// --- Pagination E2E Tests --- + +func TestCheckCrawlStatusWithPaginationE2E(t *testing.T) { + app, err := NewFirecrawlApp(TEST_API_KEY, API_URL) + require.NoError(t, err) + + // Start a crawl with enough pages to generate pagination. 
+ response, err := app.AsyncCrawlURL(context.Background(), "https://docs.firecrawl.dev", &CrawlParams{ + Limit: ptr(5), + }, nil) + require.NoError(t, err) + + // Wait a bit, then check with pagination. + time.Sleep(10 * time.Second) + + status, err := app.CheckCrawlStatus(context.Background(), response.ID, &PaginationConfig{ + AutoPaginate: ptr(true), + MaxPages: ptr(2), + }) + require.NoError(t, err) + assert.NotEmpty(t, status.Status) +} diff --git a/firecrawl_test.go_V0 b/firecrawl_test.go_V0 deleted file mode 100644 index 925e8eb..0000000 --- a/firecrawl_test.go_V0 +++ /dev/null @@ -1,304 +0,0 @@ -package firecrawl - -import ( - "log" - "os" - "testing" - "time" - - "github.com/google/uuid" - "github.com/joho/godotenv" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var API_URL_V0 string -var TEST_API_KEY_V0 string - -func init() { - err := godotenv.Load("../.env") - if err != nil { - log.Fatalf("Error loading .env file: %v", err) - } - API_URL_V0 = os.Getenv("API_URL") - TEST_API_KEY_V0 = os.Getenv("TEST_API_KEY") -} - -func TestNoAPIKeyV0(t *testing.T) { - _, err := NewFirecrawlApp("", API_URL_V0, "v0") - assert.Error(t, err) - assert.Contains(t, err.Error(), "no API key provided") -} - -func TestScrapeURLInvalidAPIKeyV0(t *testing.T) { - app, err := NewFirecrawlApp("invalid_api_key", API_URL_V0, "v0") - require.NoError(t, err) - - _, err = app.ScrapeURL("https://firecrawl.dev", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token") -} - -func TestBlocklistedURLV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY_V0, API_URL_V0, "v0") - require.NoError(t, err) - - _, err = app.ScrapeURL("https://facebook.com/fake-test", nil) - assert.Error(t, err) - assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. 
Firecrawl currently does not support social media scraping due to policy restrictions.") -} - -func TestSuccessfulResponseWithValidPreviewTokenV0(t *testing.T) { - app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL_V0, "v0") - require.NoError(t, err) - - response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) - require.NoError(t, err) - assert.NotNil(t, response) - - scrapeResponse := response.(*FirecrawlDocumentV0) - assert.Contains(t, scrapeResponse.Content, "_Roast_") -} - -func TestScrapeURLE2EV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY_V0, API_URL_V0, "v0") - require.NoError(t, err) - - response, err := app.ScrapeURL("https://roastmywebsite.ai", nil) - require.NoError(t, err) - assert.NotNil(t, response) - - scrapeResponse := response.(*FirecrawlDocumentV0) - assert.Contains(t, scrapeResponse.Content, "_Roast_") - assert.NotEqual(t, scrapeResponse.Markdown, "") - assert.NotNil(t, scrapeResponse.Metadata) - assert.Equal(t, scrapeResponse.HTML, "") -} - -func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTMLV0(t *testing.T) { - app, err := NewFirecrawlApp(TEST_API_KEY_V0, API_URL_V0, "v0") - require.NoError(t, err) - - params := map[string]any{ - "pageOptions": map[string]any{ - "includeHtml": true, - }, - } - response, err := app.ScrapeURL("https://roastmywebsite.ai", params) - require.NoError(t, err) - assert.NotNil(t, response) - - scrapeResponse := response.(*FirecrawlDocumentV0) - - assert.Contains(t, scrapeResponse.Content, "_Roast_") - assert.Contains(t, scrapeResponse.Markdown, "_Roast_") - assert.Contains(t, scrapeResponse.HTML, " 3 { + return nil, fmt.Errorf("crawl job completed but no data was returned") + } + case "scraping": + pollInterval = max(pollInterval, 2) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case <-time.After(time.Duration(pollInterval) * time.Second): + } + case "failed": + return nil, fmt.Errorf("crawl job failed. 
Status: %s", status) + default: + return nil, fmt.Errorf("unknown crawl status: %s", status) + } + } +} diff --git a/helpers_test.go b/helpers_test.go new file mode 100644 index 0000000..e8c9456 --- /dev/null +++ b/helpers_test.go @@ -0,0 +1,277 @@ +package firecrawl + +import ( + "context" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// ---- makeRequest ---- + +func TestMakeRequest_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + assert.Equal(t, "/test", r.URL.Path) + respondJSON(w, http.StatusOK, map[string]string{"status": "ok"}) + }) + + headers := app.prepareHeaders(nil) + resp, err := app.makeRequest( + context.Background(), + http.MethodGet, + app.APIURL+"/test", + nil, + headers, + "test request", + ) + require.NoError(t, err) + assert.Contains(t, string(resp), "ok") +} + +func TestMakeRequest_PostWithBody(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "application/json", r.Header.Get("Content-Type")) + + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + + respondJSON(w, http.StatusOK, map[string]string{"result": "success"}) + }) + + headers := app.prepareHeaders(nil) + reqBody := []byte(`{"url":"https://example.com"}`) + resp, err := app.makeRequest( + context.Background(), + http.MethodPost, + app.APIURL+"/test", + reqBody, + headers, + "test post", + ) + require.NoError(t, err) + assert.Contains(t, string(resp), "success") +} + +func TestMakeRequest_RetryOn502(t *testing.T) { + attempts := 0 + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + attempts++ + if attempts < 3 { + w.WriteHeader(http.StatusBadGateway) + return + } + respondJSON(w, http.StatusOK, map[string]string{"status": "ok"}) + }) + + 
headers := app.prepareHeaders(nil) + resp, err := app.makeRequest( + context.Background(), + http.MethodGet, + app.APIURL+"/test", + nil, + headers, + "test retry", + withRetries(3), + withBackoff(0), // 0ms backoff for fast tests + ) + require.NoError(t, err) + assert.NotNil(t, resp) + assert.Equal(t, 3, attempts) +} + +func TestMakeRequest_NoRetryOn4xx(t *testing.T) { + attempts := 0 + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + attempts++ + respondJSON(w, http.StatusBadRequest, map[string]string{"error": "Bad request"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.makeRequest( + context.Background(), + http.MethodGet, + app.APIURL+"/test", + nil, + headers, + "test no retry", + withRetries(3), + withBackoff(0), + ) + // Should fail immediately, not retry + assert.Error(t, err) + assert.Equal(t, 1, attempts, "4xx errors should not be retried") +} + +func TestMakeRequest_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + headers := app.prepareHeaders(nil) + _, err := app.makeRequest( + ctx, + http.MethodGet, + app.APIURL+"/test", + nil, + headers, + "test cancelled", + ) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + +func TestMakeRequest_NonJSONErrorBody(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + _, _ = w.Write([]byte("Internal Server Error\n")) + }) + + headers := app.prepareHeaders(nil) + _, err := app.makeRequest( + context.Background(), + http.MethodGet, + app.APIURL+"/test", + nil, + headers, + "test non-json error", + ) + assert.Error(t, err) + // Should still produce an error with status code info + var apiErr *APIError + assert.ErrorAs(t, err, &apiErr) + assert.Equal(t, 
500, apiErr.StatusCode) +} + +func TestMakeRequest_AuthorizationHeader(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "Bearer fc-test-key", r.Header.Get("Authorization")) + respondJSON(w, http.StatusOK, map[string]string{"ok": "true"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.makeRequest( + context.Background(), + http.MethodGet, + app.APIURL+"/test", + nil, + headers, + "test auth header", + ) + require.NoError(t, err) +} + +// ---- monitorJobStatus ---- + +func TestMonitorJobStatus_CompletedImmediately(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodGet, r.Method) + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 3, + Completed: 3, + Data: []*FirecrawlDocument{{Markdown: "# Doc 1"}, {Markdown: "# Doc 2"}, {Markdown: "# Doc 3"}}, + }) + }) + + headers := app.prepareHeaders(nil) + result, err := app.monitorJobStatus(context.Background(), validCrawlID, headers, 0) + require.NoError(t, err) + assert.Equal(t, "completed", result.Status) + assert.Len(t, result.Data, 3) +} + +func TestMonitorJobStatus_Failed(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, CrawlStatusResponse{Status: "failed"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorJobStatus(context.Background(), validCrawlID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed") +} + +func TestMonitorJobStatus_UnknownStatus(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, CrawlStatusResponse{Status: "unknown_status"}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorJobStatus(context.Background(), validCrawlID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unknown crawl status") +} + +func 
TestMonitorJobStatus_EmptyStatus(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, CrawlStatusResponse{Status: ""}) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorJobStatus(context.Background(), validCrawlID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid status") +} + +func TestMonitorJobStatus_ContextCancelledBeforeRequest(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel before any request + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorJobStatus(ctx, validCrawlID, headers, 0) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + +func TestMonitorJobStatus_CompletedNoData(t *testing.T) { + // When status is "completed" but Data is nil, it retries up to 3 times then errors. + requestCount := 0 + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + requestCount++ + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 1, + Completed: 1, + Data: nil, // No data + }) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorJobStatus(context.Background(), validCrawlID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "no data was returned") + // Should have retried 3+ times before giving up + assert.GreaterOrEqual(t, requestCount, 3) +} + +func TestMonitorJobStatus_PaginationUnsafeURL(t *testing.T) { + // Verify that monitorJobStatus rejects pagination Next URLs pointing to a different host (SSRF prevention). 
+ app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + // Return a completed status with a Next URL pointing to a different (attacker-controlled) host + next := "https://attacker.example.com/steal-token?cursor=2" + respondJSON(w, http.StatusOK, CrawlStatusResponse{ + Status: "completed", + Total: 2, + Completed: 2, + Data: []*FirecrawlDocument{{Markdown: "# Page 1"}}, + Next: &next, + }) + }) + + headers := app.prepareHeaders(nil) + _, err := app.monitorJobStatus(context.Background(), validCrawlID, headers, 0) + assert.Error(t, err) + assert.Contains(t, err.Error(), "unsafe pagination URL") +} diff --git a/map.go b/map.go new file mode 100644 index 0000000..2293070 --- /dev/null +++ b/map.go @@ -0,0 +1,75 @@ +package firecrawl + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// mapRequest is the internal request body for the v2 /map endpoint. +type mapRequest struct { + URL string `json:"url"` + IncludeSubdomains *bool `json:"includeSubdomains,omitempty"` + Search *string `json:"search,omitempty"` + Limit *int `json:"limit,omitempty"` + Sitemap *string `json:"sitemap,omitempty"` + IgnoreQueryParameters *bool `json:"ignoreQueryParameters,omitempty"` + IgnoreCache *bool `json:"ignoreCache,omitempty"` + Timeout *int `json:"timeout,omitempty"` + Location *LocationConfig `json:"location,omitempty"` +} + +// MapURL initiates a mapping operation for a URL using the Firecrawl API. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - url: The URL to map. +// - params: Optional parameters for the mapping request. +// +// Returns: +// - *MapResponse: The response from the mapping operation, with Links as []MapLink. +// - error: An error if the mapping request fails. 
+func (app *FirecrawlApp) MapURL(ctx context.Context, url string, params *MapParams) (*MapResponse, error) { + headers := app.prepareHeaders(nil) + + req := mapRequest{URL: url} + if params != nil { + req.IncludeSubdomains = params.IncludeSubdomains + req.Search = params.Search + req.Limit = params.Limit + req.Sitemap = params.Sitemap + req.IgnoreQueryParameters = params.IgnoreQueryParameters + req.IgnoreCache = params.IgnoreCache + req.Timeout = params.Timeout + req.Location = params.Location + } + + body, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal map request: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodPost, + fmt.Sprintf("%s/v2/map", app.APIURL), + body, + headers, + "map", + ) + if err != nil { + return nil, err + } + + var mapResponse MapResponse + err = json.Unmarshal(resp, &mapResponse) + if err != nil { + return nil, err + } + + if mapResponse.Success { + return &mapResponse, nil + } + return nil, fmt.Errorf("map operation failed: %s", mapResponse.Error) +} diff --git a/map_test.go b/map_test.go new file mode 100644 index 0000000..248483f --- /dev/null +++ b/map_test.go @@ -0,0 +1,138 @@ +package firecrawl + +import ( + "context" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestMapURL_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "/v2/map", r.URL.Path) + assert.Equal(t, "Bearer fc-test-key", r.Header.Get("Authorization")) + + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + + respondJSON(w, http.StatusOK, MapResponse{ + Success: true, + Links: []MapLink{ + {URL: "https://example.com/page1", Title: ptr("Page 1")}, + {URL: "https://example.com/page2", Title: ptr("Page 2")}, + }, + }) + }) + + result, err := app.MapURL(context.Background(), 
"https://example.com", nil) + require.NoError(t, err) + assert.Len(t, result.Links, 2) + assert.Equal(t, "https://example.com/page1", result.Links[0].URL) + assert.Equal(t, "Page 1", *result.Links[0].Title) +} + +func TestMapURL_AllParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + + assert.Equal(t, "https://example.com", body["url"]) + assert.Equal(t, true, body["includeSubdomains"]) + assert.Equal(t, "blog", body["search"]) + assert.NotNil(t, body["limit"]) + assert.Equal(t, "include", body["sitemap"]) + assert.Equal(t, true, body["ignoreQueryParameters"]) + assert.Equal(t, true, body["ignoreCache"]) + assert.NotNil(t, body["timeout"]) + assert.NotNil(t, body["location"]) + + respondJSON(w, http.StatusOK, MapResponse{ + Success: true, + Links: []MapLink{{URL: "https://example.com/blog/post-1"}}, + }) + }) + + result, err := app.MapURL(context.Background(), "https://example.com", &MapParams{ + IncludeSubdomains: ptr(true), + Search: ptr("blog"), + Limit: ptr(1000), + Sitemap: ptr("include"), + IgnoreQueryParameters: ptr(true), + IgnoreCache: ptr(true), + Timeout: ptr(30000), + Location: &LocationConfig{Country: "US", Languages: []string{"en"}}, + }) + require.NoError(t, err) + assert.Len(t, result.Links, 1) +} + +func TestMapURL_NilParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + // Only url should be present, no optional params + assert.Equal(t, "https://example.com", body["url"]) + assert.Nil(t, body["includeSubdomains"]) + assert.Nil(t, body["search"]) + + respondJSON(w, http.StatusOK, MapResponse{ + Success: true, + Links: []MapLink{{URL: "https://example.com"}}, + }) + }) + + result, err := app.MapURL(context.Background(), "https://example.com", nil) + require.NoError(t, err) + assert.NotNil(t, result) +} + +func TestMapURL_EmptyLinks(t *testing.T) 
{ + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, MapResponse{ + Success: true, + Links: []MapLink{}, + }) + }) + + result, err := app.MapURL(context.Background(), "https://example.com", nil) + require.NoError(t, err) + assert.NotNil(t, result) + assert.Empty(t, result.Links) +} + +func TestMapURL_FailedResponse(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, MapResponse{ + Success: false, + Error: "map operation failed: site not reachable", + }) + }) + + _, err := app.MapURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "map operation failed") +} + +func TestMapURL_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"}) + }) + + _, err := app.MapURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +func TestMapURL_ServerError(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusInternalServerError, map[string]string{"error": "Internal failure"}) + }) + + _, err := app.MapURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrServerError) +} diff --git a/options.go b/options.go new file mode 100644 index 0000000..d3321ec --- /dev/null +++ b/options.go @@ -0,0 +1,51 @@ +package firecrawl + +// requestOptions represents options for making requests. +type requestOptions struct { + retries int + backoff int +} + +// requestOption is a functional option type for requestOptions. +type requestOption func(*requestOptions) + +// newRequestOptions creates a new requestOptions instance with the provided options. 
+// +// Parameters: +// - opts: Optional request options. +// +// Returns: +// - *requestOptions: A new instance of requestOptions with the provided options. +func newRequestOptions(opts ...requestOption) *requestOptions { + options := &requestOptions{retries: 1} + for _, opt := range opts { + opt(options) + } + return options +} + +// withRetries sets the number of retries for a request. +// +// Parameters: +// - retries: The number of retries to be performed. +// +// Returns: +// - requestOption: A functional option that sets the number of retries for a request. +func withRetries(retries int) requestOption { + return func(opts *requestOptions) { + opts.retries = retries + } +} + +// withBackoff sets the backoff interval for a request. +// +// Parameters: +// - backoff: The backoff interval (in milliseconds) to be used for retries. +// +// Returns: +// - requestOption: A functional option that sets the backoff interval for a request. +func withBackoff(backoff int) requestOption { + return func(opts *requestOptions) { + opts.backoff = backoff + } +} diff --git a/scrape.go b/scrape.go new file mode 100644 index 0000000..ce8c96b --- /dev/null +++ b/scrape.go @@ -0,0 +1,100 @@ +package firecrawl + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// scrapeRequest is the internal request struct for scrape operations. +// It is unexported — callers use ScrapeParams instead. 
+type scrapeRequest struct { + URL string `json:"url"` + Formats []string `json:"formats,omitempty"` + Headers *map[string]string `json:"headers,omitempty"` + IncludeTags []string `json:"includeTags,omitempty"` + ExcludeTags []string `json:"excludeTags,omitempty"` + OnlyMainContent *bool `json:"onlyMainContent,omitempty"` + WaitFor *int `json:"waitFor,omitempty"` + Timeout *int `json:"timeout,omitempty"` + MaxAge *int `json:"maxAge,omitempty"` + MinAge *int `json:"minAge,omitempty"` + JsonOptions *JsonOptions `json:"jsonOptions,omitempty"` + Mobile *bool `json:"mobile,omitempty"` + SkipTlsVerification *bool `json:"skipTlsVerification,omitempty"` + BlockAds *bool `json:"blockAds,omitempty"` + Proxy *string `json:"proxy,omitempty"` + Location *LocationConfig `json:"location,omitempty"` + Parsers []ParserConfig `json:"parsers,omitempty"` + Actions []ActionConfig `json:"actions,omitempty"` + RemoveBase64Images *bool `json:"removeBase64Images,omitempty"` + StoreInCache *bool `json:"storeInCache,omitempty"` + ZeroDataRetention *bool `json:"zeroDataRetention,omitempty"` +} + +// ScrapeURL scrapes the content of the specified URL using the Firecrawl API. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - url: The URL to be scraped. +// - params: Optional parameters for the scrape request, including formats, actions, location, and LLM extraction options. +// +// Returns: +// - *FirecrawlDocument: The scraped document data. +// - error: An error if the scrape request fails. 
+func (app *FirecrawlApp) ScrapeURL(ctx context.Context, url string, params *ScrapeParams) (*FirecrawlDocument, error) { + headers := app.prepareHeaders(nil) + + req := scrapeRequest{URL: url} + if params != nil { + req.Formats = params.Formats + req.Headers = params.Headers + req.IncludeTags = params.IncludeTags + req.ExcludeTags = params.ExcludeTags + req.OnlyMainContent = params.OnlyMainContent + req.WaitFor = params.WaitFor + req.Timeout = params.Timeout + req.MaxAge = params.MaxAge + req.MinAge = params.MinAge + req.JsonOptions = params.JsonOptions + req.Mobile = params.Mobile + req.SkipTlsVerification = params.SkipTlsVerification + req.BlockAds = params.BlockAds + req.Proxy = params.Proxy + req.Location = params.Location + req.Parsers = params.Parsers + req.Actions = params.Actions + req.RemoveBase64Images = params.RemoveBase64Images + req.StoreInCache = params.StoreInCache + req.ZeroDataRetention = params.ZeroDataRetention + } + + body, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal scrape request: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodPost, + fmt.Sprintf("%s/v2/scrape", app.APIURL), + body, + headers, + "scrape URL", + ) + if err != nil { + return nil, err + } + + var scrapeResponse ScrapeResponse + if err := json.Unmarshal(resp, &scrapeResponse); err != nil { + return nil, fmt.Errorf("failed to parse scrape response: %w", err) + } + + if !scrapeResponse.Success { + return nil, fmt.Errorf("failed to scrape URL") + } + + return scrapeResponse.Data, nil +} diff --git a/scrape_test.go b/scrape_test.go new file mode 100644 index 0000000..506dfc6 --- /dev/null +++ b/scrape_test.go @@ -0,0 +1,183 @@ +package firecrawl + +import ( + "context" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestScrapeURL_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, 
http.MethodPost, r.Method) + assert.Equal(t, "/v2/scrape", r.URL.Path) + assert.Equal(t, "Bearer fc-test-key", r.Header.Get("Authorization")) + + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + + respondJSON(w, http.StatusOK, ScrapeResponse{ + Success: true, + Data: &FirecrawlDocument{Markdown: "# Hello"}, + }) + }) + + result, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + require.NoError(t, err) + assert.Equal(t, "# Hello", result.Markdown) +} + +func TestScrapeURL_WithParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "https://example.com", body["url"]) + + respondJSON(w, http.StatusOK, ScrapeResponse{ + Success: true, + Data: &FirecrawlDocument{Markdown: "# Hello", HTML: "

Hello

"}, + }) + }) + + params := &ScrapeParams{ + Formats: []string{"markdown", "html"}, + OnlyMainContent: ptr(true), + WaitFor: ptr(1000), + } + result, err := app.ScrapeURL(context.Background(), "https://example.com", params) + require.NoError(t, err) + assert.Equal(t, "# Hello", result.Markdown) + assert.Equal(t, "

Hello

", result.HTML) +} + +func TestScrapeURL_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid token"}) + }) + + _, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +func TestScrapeURL_AllParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + + assert.Equal(t, "https://example.com", body["url"]) + assert.Contains(t, body["formats"], "markdown") + assert.Contains(t, body["formats"], "html") + assert.Equal(t, true, body["onlyMainContent"]) + assert.Equal(t, true, body["mobile"]) + assert.NotNil(t, body["waitFor"]) + assert.NotNil(t, body["timeout"]) + assert.NotNil(t, body["location"]) + assert.NotNil(t, body["actions"]) + + respondJSON(w, http.StatusOK, ScrapeResponse{ + Success: true, + Data: &FirecrawlDocument{Markdown: "# Test"}, + }) + }) + + result, err := app.ScrapeURL(context.Background(), "https://example.com", &ScrapeParams{ + Formats: []string{"markdown", "html"}, + OnlyMainContent: ptr(true), + Mobile: ptr(true), + WaitFor: ptr(1000), + Timeout: ptr(30000), + Location: &LocationConfig{Country: "US", Languages: []string{"en"}}, + Actions: []ActionConfig{ + {Type: "wait", Milliseconds: ptr(500)}, + {Type: "click", Selector: ptr("#button")}, + }, + Proxy: ptr("basic"), + RemoveBase64Images: ptr(true), + ZeroDataRetention: ptr(true), + }) + require.NoError(t, err) + assert.NotNil(t, result) +} + +func TestScrapeURL_ServerError(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusInternalServerError, map[string]string{"error": "Internal failure"}) + }) + + _, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, 
ErrServerError) +} + +func TestScrapeURL_RateLimited(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusTooManyRequests, map[string]string{"error": "Too many requests"}) + }) + + _, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrRateLimited) +} + +func TestScrapeURL_FailedResponse(t *testing.T) { + // The server returns 200 OK but success:false + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, ScrapeResponse{ + Success: false, + Data: nil, + }) + }) + + _, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "failed to scrape URL") +} + +func TestScrapeURL_InvalidJSON(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte(`{not valid json`)) + }) + + _, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "parse scrape response") +} + +func TestScrapeURL_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel immediately + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + _, err := app.ScrapeURL(ctx, "https://example.com", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} + +func TestScrapeURL_NilParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + // Only url should be present when params is nil + assert.Equal(t, "https://example.com", body["url"]) + assert.Nil(t, body["formats"]) + assert.Nil(t, 
body["mobile"]) + + respondJSON(w, http.StatusOK, ScrapeResponse{ + Success: true, + Data: &FirecrawlDocument{Markdown: "# Hello"}, + }) + }) + + result, err := app.ScrapeURL(context.Background(), "https://example.com", nil) + require.NoError(t, err) + assert.NotNil(t, result) +} diff --git a/search.go b/search.go new file mode 100644 index 0000000..e875faa --- /dev/null +++ b/search.go @@ -0,0 +1,78 @@ +package firecrawl + +import ( + "context" + "encoding/json" + "fmt" + "net/http" +) + +// searchRequest is the internal request struct for search operations. +// It is unexported — callers use SearchParams instead. +type searchRequest struct { + Query string `json:"query"` + Limit *int `json:"limit,omitempty"` + Sources []string `json:"sources,omitempty"` + Categories []string `json:"categories,omitempty"` + TBS *string `json:"tbs,omitempty"` + Location *string `json:"location,omitempty"` + Country *string `json:"country,omitempty"` + Timeout *int `json:"timeout,omitempty"` + IgnoreInvalidURLs *bool `json:"ignoreInvalidURLs,omitempty"` + ScrapeOptions *ScrapeParams `json:"scrapeOptions,omitempty"` +} + +// Search performs a web search using the Firecrawl API. +// +// Parameters: +// - ctx: Context for cancellation and deadlines. +// - query: The search query string. +// - params: Optional search parameters. If nil, defaults are used. +// +// Returns: +// - *SearchResponse: The search results containing web, image, and news results. +// - error: An error if the search request fails. 
+func (app *FirecrawlApp) Search(ctx context.Context, query string, params *SearchParams) (*SearchResponse, error) { + headers := app.prepareHeaders(nil) + + req := searchRequest{Query: query} + if params != nil { + req.Limit = params.Limit + req.Sources = params.Sources + req.Categories = params.Categories + req.TBS = params.TBS + req.Location = params.Location + req.Country = params.Country + req.Timeout = params.Timeout + req.IgnoreInvalidURLs = params.IgnoreInvalidURLs + req.ScrapeOptions = params.ScrapeOptions + } + + body, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal search request: %w", err) + } + + resp, err := app.makeRequest( + ctx, + http.MethodPost, + fmt.Sprintf("%s/v2/search", app.APIURL), + body, + headers, + "search", + ) + if err != nil { + return nil, err + } + + var searchResponse SearchResponse + if err := json.Unmarshal(resp, &searchResponse); err != nil { + return nil, fmt.Errorf("failed to parse search response: %w", err) + } + + if !searchResponse.Success { + return nil, fmt.Errorf("search operation failed") + } + + return &searchResponse, nil +} diff --git a/search_test.go b/search_test.go new file mode 100644 index 0000000..bcc1e48 --- /dev/null +++ b/search_test.go @@ -0,0 +1,162 @@ +package firecrawl + +import ( + "context" + "net/http" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestSearch_Success(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, http.MethodPost, r.Method) + assert.Equal(t, "/v2/search", r.URL.Path) + assert.Equal(t, "Bearer fc-test-key", r.Header.Get("Authorization")) + + var body map[string]any + decodeJSONBody(t, r, &body) + assert.Equal(t, "golang tutorials", body["query"]) + + respondJSON(w, http.StatusOK, SearchResponse{ + Success: true, + Data: SearchData{ + Web: []SearchWebResult{ + {Title: "Go Tour", Description: "A tour of Go", URL: 
"https://go.dev/tour"}, + {Title: "Go Docs", Description: "Go documentation", URL: "https://pkg.go.dev"}, + }, + }, + CreditsUsed: 1, + }) + }) + + result, err := app.Search(context.Background(), "golang tutorials", nil) + require.NoError(t, err) + require.NotNil(t, result) + assert.True(t, result.Success) + assert.Len(t, result.Data.Web, 2) + assert.Equal(t, "Go Tour", result.Data.Web[0].Title) + assert.Equal(t, "https://go.dev/tour", result.Data.Web[0].URL) + assert.Equal(t, 1, result.CreditsUsed) +} + +func TestSearch_WithParams(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + + assert.Equal(t, "go programming", body["query"]) + assert.NotNil(t, body["limit"]) + assert.Equal(t, float64(5), body["limit"]) + assert.Contains(t, body["sources"], "web") + assert.Contains(t, body["sources"], "news") + assert.Contains(t, body["categories"], "github") + assert.Equal(t, "qdr:d", body["tbs"]) + assert.Equal(t, "New York", body["location"]) + assert.Equal(t, "US", body["country"]) + assert.NotNil(t, body["timeout"]) + assert.Equal(t, true, body["ignoreInvalidURLs"]) + assert.NotNil(t, body["scrapeOptions"]) + + respondJSON(w, http.StatusOK, SearchResponse{ + Success: true, + Data: SearchData{ + Web: []SearchWebResult{{Title: "Result", Description: "Desc", URL: "https://example.com"}}, + News: []SearchNewsResult{{Title: "News", Snippet: "Snippet", URL: "https://news.example.com", Date: "2026-03-15", Position: 1}}, + }, + }) + }) + + params := &SearchParams{ + Limit: ptr(5), + Sources: []string{"web", "news"}, + Categories: []string{"github"}, + TBS: ptr("qdr:d"), + Location: ptr("New York"), + Country: ptr("US"), + Timeout: ptr(10000), + IgnoreInvalidURLs: ptr(true), + ScrapeOptions: &ScrapeParams{Formats: []string{"markdown"}}, + } + + result, err := app.Search(context.Background(), "go programming", params) + require.NoError(t, err) + require.NotNil(t, result) + 
assert.Len(t, result.Data.Web, 1) + assert.Len(t, result.Data.News, 1) +} + +func TestSearch_EmptyQuery(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + var body map[string]any + decodeJSONBody(t, r, &body) + // Empty string query is sent — the API decides whether to accept it + assert.Equal(t, "", body["query"]) + + respondJSON(w, http.StatusOK, SearchResponse{ + Success: true, + Data: SearchData{}, + }) + }) + + result, err := app.Search(context.Background(), "", nil) + require.NoError(t, err) + assert.NotNil(t, result) +} + +func TestSearch_Unauthorized(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusUnauthorized, map[string]string{"error": "Invalid API key"}) + }) + + _, err := app.Search(context.Background(), "test query", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrUnauthorized) +} + +func TestSearch_ServerError(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusInternalServerError, map[string]string{"error": "Internal server error"}) + }) + + _, err := app.Search(context.Background(), "test query", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, ErrServerError) +} + +func TestSearch_FailedResponse(t *testing.T) { + // Server returns 200 OK but success:false + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusOK, SearchResponse{ + Success: false, + }) + }) + + _, err := app.Search(context.Background(), "test query", nil) + assert.Error(t, err) + assert.Contains(t, err.Error(), "search operation failed") +} + +func TestSearch_RateLimited(t *testing.T) { + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + respondJSON(w, http.StatusTooManyRequests, map[string]string{"error": "rate limit exceeded"}) + }) + + _, err := app.Search(context.Background(), "test query", nil) + assert.Error(t, 
err) + assert.ErrorIs(t, err, ErrRateLimited) +} + +func TestSearch_ContextCancelled(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // Cancel before making any request + + app, _ := newMockServer(t, func(w http.ResponseWriter, r *http.Request) { + t.Fatal("request should not be made with cancelled context") + }) + + _, err := app.Search(ctx, "test query", nil) + assert.Error(t, err) + assert.ErrorIs(t, err, context.Canceled) +} diff --git a/security.go b/security.go new file mode 100644 index 0000000..3ef5ba4 --- /dev/null +++ b/security.go @@ -0,0 +1,38 @@ +package firecrawl + +import ( + "fmt" + "net/url" + + "github.com/google/uuid" +) + +// validatePaginationURL ensures a Next pagination URL points to the same host +// as the SDK's configured API URL, preventing SSRF attacks via malicious Next +// URLs in API responses. +func validatePaginationURL(baseURL, nextURL string) error { + base, err := url.Parse(baseURL) + if err != nil { + return fmt.Errorf("invalid base URL: %w", err) + } + + next, err := url.Parse(nextURL) + if err != nil { + return fmt.Errorf("invalid pagination URL: %w", err) + } + + if next.Host != base.Host { + return fmt.Errorf("pagination URL host %q does not match API host %q", next.Host, base.Host) + } + + return nil +} + +// validateJobID ensures a job ID is a valid UUID, preventing path injection +// attacks via crafted IDs like "../../admin". 
+func validateJobID(id string) error { + if _, err := uuid.Parse(id); err != nil { + return fmt.Errorf("invalid job ID %q: must be a valid UUID: %w", id, err) + } + return nil +} diff --git a/security_test.go b/security_test.go new file mode 100644 index 0000000..b0b3df4 --- /dev/null +++ b/security_test.go @@ -0,0 +1,205 @@ +package firecrawl + +import ( + "bytes" + "log" + "strings" + "testing" +) + +// ---- validatePaginationURL ---- + +func TestValidatePaginationURL_SameHost(t *testing.T) { + err := validatePaginationURL( + "https://api.firecrawl.dev", + "https://api.firecrawl.dev/v2/crawl/abc123?cursor=2", + ) + if err != nil { + t.Fatalf("expected no error for matching hosts, got: %v", err) + } +} + +func TestValidatePaginationURL_DifferentHost(t *testing.T) { + err := validatePaginationURL( + "https://api.firecrawl.dev", + "https://attacker.example.com/steal-token", + ) + if err == nil { + t.Fatal("expected error for mismatched hosts, got nil") + } + if !strings.Contains(err.Error(), "does not match API host") { + t.Fatalf("expected host mismatch error, got: %v", err) + } +} + +func TestValidatePaginationURL_EmptyNextURL(t *testing.T) { + // An empty string parses to a URL with no host — should fail since base has a host. + err := validatePaginationURL( + "https://api.firecrawl.dev", + "", + ) + if err == nil { + t.Fatal("expected error for empty next URL, got nil") + } +} + +func TestValidatePaginationURL_RelativeURL(t *testing.T) { + // A relative URL has no host — should fail host comparison. 
+ err := validatePaginationURL( + "https://api.firecrawl.dev", + "/v2/crawl/abc123?cursor=2", + ) + if err == nil { + t.Fatal("expected error for relative URL (no host), got nil") + } + if !strings.Contains(err.Error(), "does not match API host") { + t.Fatalf("expected host mismatch error, got: %v", err) + } +} + +// ---- validateJobID ---- + +func TestValidateJobID_ValidUUID(t *testing.T) { + err := validateJobID("550e8400-e29b-41d4-a716-446655440000") + if err != nil { + t.Fatalf("expected no error for valid UUID, got: %v", err) + } +} + +func TestValidateJobID_InvalidString(t *testing.T) { + err := validateJobID("not-a-uuid") + if err == nil { + t.Fatal("expected error for non-UUID string, got nil") + } + if !strings.Contains(err.Error(), "must be a valid UUID") { + t.Fatalf("expected UUID error message, got: %v", err) + } +} + +func TestValidateJobID_PathTraversal(t *testing.T) { + err := validateJobID("../../admin") + if err == nil { + t.Fatal("expected error for path traversal string, got nil") + } + if !strings.Contains(err.Error(), "must be a valid UUID") { + t.Fatalf("expected UUID error message, got: %v", err) + } +} + +func TestValidateJobID_EmptyString(t *testing.T) { + err := validateJobID("") + if err == nil { + t.Fatal("expected error for empty string, got nil") + } + if !strings.Contains(err.Error(), "must be a valid UUID") { + t.Fatalf("expected UUID error message, got: %v", err) + } +} + +// ---- FirecrawlApp.String() redaction ---- + +func TestFirecrawlApp_String_Redaction(t *testing.T) { + app := &FirecrawlApp{ + apiKey: "fc-abcdefghijklmnop", + APIURL: "https://api.firecrawl.dev", + } + s := app.String() + if strings.Contains(s, "fc-abcdefghijklmnop") { + t.Fatalf("String() should redact the API key, but found full key in: %s", s) + } + // Should show first 3 chars and last 4 chars. 
+ if !strings.Contains(s, "fc-") { + t.Fatalf("String() should show first 3 chars, got: %s", s) + } + if !strings.Contains(s, "mnop") { + t.Fatalf("String() should show last 4 chars, got: %s", s) + } + if !strings.Contains(s, "...") { + t.Fatalf("String() should contain '...', got: %s", s) + } +} + +func TestFirecrawlApp_String_ShortKey(t *testing.T) { + app := &FirecrawlApp{ + apiKey: "mykey", + APIURL: "https://api.firecrawl.dev", + } + s := app.String() + // Short keys (<=7 chars) get fully replaced with "***". + if strings.Contains(s, "mykey") { + t.Fatalf("String() should redact short keys, but found full key in: %s", s) + } + if !strings.Contains(s, "***") { + t.Fatalf("String() should use '***' for short keys, got: %s", s) + } +} + +// ---- APIKey() accessor ---- + +func TestFirecrawlApp_APIKey_Accessor(t *testing.T) { + app := &FirecrawlApp{ + apiKey: "fc-test-key-1234", + APIURL: "https://api.firecrawl.dev", + } + if app.APIKey() != "fc-test-key-1234" { + t.Fatalf("APIKey() returned %q, want %q", app.APIKey(), "fc-test-key-1234") + } +} + +// ---- HTTPS warning in NewFirecrawlApp ---- + +func TestNewFirecrawlApp_HTTPWarning(t *testing.T) { + var buf bytes.Buffer + log.SetOutput(&buf) + defer log.SetOutput(nil) // restore default output after test + + _, err := NewFirecrawlApp("test-key", "http://remote.example.com") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + logOutput := buf.String() + if !strings.Contains(logOutput, "WARNING") { + t.Fatalf("expected WARNING log for non-localhost HTTP URL, got: %q", logOutput) + } + if !strings.Contains(logOutput, "cleartext") { + t.Fatalf("expected cleartext warning in log, got: %q", logOutput) + } +} + +func TestNewFirecrawlApp_HTTPSNoWarning(t *testing.T) { + var buf bytes.Buffer + log.SetOutput(&buf) + defer log.SetOutput(nil) + + _, err := NewFirecrawlApp("test-key", "https://api.firecrawl.dev") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + logOutput := buf.String() + if 
strings.Contains(logOutput, "WARNING") { + t.Fatalf("expected no WARNING for HTTPS URL, got: %q", logOutput) + } +} + +func TestNewFirecrawlApp_HTTPLocalhostNoWarning(t *testing.T) { + var buf bytes.Buffer + log.SetOutput(&buf) + defer log.SetOutput(nil) + + for _, host := range []string{ + "http://localhost:8080", + "http://127.0.0.1:3000", + } { + buf.Reset() + _, err := NewFirecrawlApp("test-key", host) + if err != nil { + t.Fatalf("unexpected error for %s: %v", host, err) + } + logOutput := buf.String() + if strings.Contains(logOutput, "WARNING") { + t.Fatalf("expected no WARNING for localhost URL %s, got: %q", host, logOutput) + } + } +} diff --git a/testhelpers_test.go b/testhelpers_test.go new file mode 100644 index 0000000..ad5e98e --- /dev/null +++ b/testhelpers_test.go @@ -0,0 +1,40 @@ +package firecrawl + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" +) + +// newMockServer creates a test HTTP server and a FirecrawlApp configured to use it. +// The server is automatically cleaned up when the test completes. +func newMockServer(t *testing.T, handler http.HandlerFunc) (*FirecrawlApp, *httptest.Server) { + t.Helper() + server := httptest.NewServer(handler) + t.Cleanup(server.Close) + app, err := NewFirecrawlApp("fc-test-key", server.URL) + require.NoError(t, err) + return app, server +} + +// respondJSON writes a JSON response with the given status code. +func respondJSON(w http.ResponseWriter, statusCode int, v any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + _ = json.NewEncoder(w).Encode(v) //nolint:gosec +} + +// decodeJSONBody decodes the request body into the given pointer. +func decodeJSONBody(t *testing.T, r *http.Request, v any) { + t.Helper() + err := json.NewDecoder(r.Body).Decode(v) + require.NoError(t, err, "failed to decode request body") +} + +// ptr returns a pointer to the given value. Useful for constructing test params. 
+func ptr[T any](v T) *T { + return &v +} diff --git a/types.go b/types.go new file mode 100644 index 0000000..740aa05 --- /dev/null +++ b/types.go @@ -0,0 +1,549 @@ +package firecrawl + +import ( + "encoding/json" + "fmt" +) + +// StringOrStringSlice is a type that can unmarshal either a JSON string or a JSON array of strings. +type StringOrStringSlice []string + +func (s *StringOrStringSlice) UnmarshalJSON(data []byte) error { + var single string + if err := json.Unmarshal(data, &single); err == nil { + *s = []string{single} + return nil + } + + var list []string + if err := json.Unmarshal(data, &list); err == nil { + *s = list + return nil + } + + return fmt.Errorf("field is neither a string nor a list of strings") +} + +// FirecrawlDocumentMetadata represents metadata for a Firecrawl document. +type FirecrawlDocumentMetadata struct { + Title *string `json:"title,omitempty"` + Description *StringOrStringSlice `json:"description,omitempty"` + Language *StringOrStringSlice `json:"language,omitempty"` + Keywords *StringOrStringSlice `json:"keywords,omitempty"` + Robots *StringOrStringSlice `json:"robots,omitempty"` + OGTitle *StringOrStringSlice `json:"ogTitle,omitempty"` + OGDescription *StringOrStringSlice `json:"ogDescription,omitempty"` + OGURL *StringOrStringSlice `json:"ogUrl,omitempty"` + OGImage *StringOrStringSlice `json:"ogImage,omitempty"` + OGAudio *StringOrStringSlice `json:"ogAudio,omitempty"` + OGDeterminer *StringOrStringSlice `json:"ogDeterminer,omitempty"` + OGLocale *StringOrStringSlice `json:"ogLocale,omitempty"` + OGLocaleAlternate []*string `json:"ogLocaleAlternate,omitempty"` + OGSiteName *StringOrStringSlice `json:"ogSiteName,omitempty"` + OGVideo *StringOrStringSlice `json:"ogVideo,omitempty"` + DCTermsCreated *StringOrStringSlice `json:"dctermsCreated,omitempty"` + DCDateCreated *StringOrStringSlice `json:"dcDateCreated,omitempty"` + DCDate *StringOrStringSlice `json:"dcDate,omitempty"` + DCTermsType *StringOrStringSlice 
`json:"dctermsType,omitempty"` + DCType *StringOrStringSlice `json:"dcType,omitempty"` + DCTermsAudience *StringOrStringSlice `json:"dctermsAudience,omitempty"` + DCTermsSubject *StringOrStringSlice `json:"dctermsSubject,omitempty"` + DCSubject *StringOrStringSlice `json:"dcSubject,omitempty"` + DCDescription *StringOrStringSlice `json:"dcDescription,omitempty"` + DCTermsKeywords *StringOrStringSlice `json:"dctermsKeywords,omitempty"` + ModifiedTime *StringOrStringSlice `json:"modifiedTime,omitempty"` + PublishedTime *StringOrStringSlice `json:"publishedTime,omitempty"` + ArticleTag *StringOrStringSlice `json:"articleTag,omitempty"` + ArticleSection *StringOrStringSlice `json:"articleSection,omitempty"` + URL *string `json:"url,omitempty"` + ScrapeID *string `json:"scrapeId,omitempty"` + SourceURL *string `json:"sourceURL,omitempty"` + StatusCode *int `json:"statusCode,omitempty"` + Error *string `json:"error,omitempty"` +} + +// JsonOptions represents the options for JSON extraction. +type JsonOptions struct { + // Schema is an optional JSON schema for structured data extraction. + Schema map[string]any `json:"schema,omitempty"` + // SystemPrompt is an optional system-level prompt for the LLM. + SystemPrompt *string `json:"systemPrompt,omitempty"` + // Prompt is an optional user-level prompt for the LLM. + Prompt *string `json:"prompt,omitempty"` +} + +// LocationConfig represents geolocation settings for requests. +type LocationConfig struct { + // Country is the ISO 3166-1 alpha-2 country code (e.g., "US", "GB"). + Country string `json:"country,omitempty"` + // Languages is the list of BCP-47 language codes to prefer (e.g., ["en", "en-US"]). + Languages []string `json:"languages,omitempty"` +} + +// ParserConfig represents parser configuration for document parsing. +// It replaces the v1 ParsePDF field. Use Type "pdf" to parse PDF documents. +type ParserConfig struct { + // Type is the parser type (e.g., "pdf"). 
+	Type string `json:"type"`
+	// Mode is the optional parsing mode (e.g., "auto", "ocr").
+	Mode *string `json:"mode,omitempty"`
+	// MaxPages is the optional maximum number of pages to parse.
+	MaxPages *int `json:"maxPages,omitempty"`
+}
+
+// ActionConfig represents a browser action to execute during scraping.
+// The Type field is a discriminator: "wait", "click", "write", "press",
+// "scroll", "screenshot", "scrape", "executeJavascript", "pdf".
+// Type-specific fields are optional and only apply to relevant action types.
+type ActionConfig struct {
+	// Type is the action discriminator (required).
+	Type string `json:"type"`
+	// Milliseconds is the duration for "wait" actions.
+	Milliseconds *int `json:"milliseconds,omitempty"`
+	// Selector is the CSS selector for "click" and "write" actions.
+	Selector *string `json:"selector,omitempty"`
+	// Text is the text to write for "write" actions.
+	Text *string `json:"text,omitempty"`
+	// Key is the key to press for "press" actions (e.g., "Enter").
+	Key *string `json:"key,omitempty"`
+	// Direction is the scroll direction for "scroll" actions ("up" or "down").
+	Direction *string `json:"direction,omitempty"`
+	// Amount is the scroll amount in pixels for "scroll" actions.
+	Amount *int `json:"amount,omitempty"`
+	// Script is the JavaScript source code for "executeJavascript" actions.
+	Script *string `json:"script,omitempty"`
+	// FullPage captures the full page for "screenshot" actions.
+	FullPage *bool `json:"fullPage,omitempty"`
+}
+
+// WebhookConfig represents webhook configuration for async operations.
+type WebhookConfig struct {
+	// URL is the webhook endpoint URL (required).
+	URL string `json:"url"`
+	// Headers are optional custom HTTP headers to send with webhook requests.
+	Headers map[string]string `json:"headers,omitempty"`
+	// Metadata is optional arbitrary metadata to include in the webhook payload.
+	Metadata map[string]any `json:"metadata,omitempty"`
+	// Events is the list of event types to subscribe to (e.g., "completed", "page", "failed").
+	Events []string `json:"events,omitempty"`
+}
+
+// ActionsResult contains the results of browser actions executed during scraping.
+type ActionsResult struct {
+	// Screenshots contains base64-encoded screenshots from "screenshot" actions.
+	Screenshots []string `json:"screenshots,omitempty"`
+	// Scrapes contains scraped documents from "scrape" actions.
+	Scrapes []FirecrawlDocument `json:"scrapes,omitempty"`
+	// JavascriptReturns contains return values from "executeJavascript" actions.
+	JavascriptReturns []any `json:"javascriptReturns,omitempty"`
+	// PDFs contains base64-encoded PDF data from "pdf" actions.
+	PDFs []string `json:"pdfs,omitempty"`
+}
+
+// ChangeTrackingResult contains change tracking information between consecutive scrapes.
+type ChangeTrackingResult struct {
+	// PreviousScrapeAt is the RFC3339 timestamp of the previous scrape used for comparison.
+	PreviousScrapeAt *string `json:"previousScrapeAt,omitempty"`
+	// ChangeStatus indicates whether the page changed ("changed", "unchanged", "new").
+	ChangeStatus *string `json:"changeStatus,omitempty"`
+	// Visibility indicates the page visibility status. NOTE(review): allowed values are not documented here — confirm against the API reference.
+	Visibility *string `json:"visibility,omitempty"`
+	// Diff is the text diff between the current and previous scrape.
+	Diff *string `json:"diff,omitempty"`
+	// JSON contains the structured diff data.
+	JSON map[string]any `json:"json,omitempty"`
+}
+
+// BrandingResult contains extracted branding information from a page.
+type BrandingResult struct {
+	// ColorScheme is the detected color scheme ("light" or "dark").
+	ColorScheme *string `json:"colorScheme,omitempty"`
+	// Logo is the URL of the detected logo image.
+	Logo *string `json:"logo,omitempty"`
+	// Colors contains extracted color values keyed by role (e.g., "primary", "background").
+	Colors map[string]any `json:"colors,omitempty"`
+	// Fonts contains extracted font information keyed by role (e.g., "heading", "body").
+	Fonts map[string]any `json:"fonts,omitempty"`
+}
+
+// FirecrawlDocument represents a scraped document returned by the Firecrawl API. Note the mixed convention: string-valued fields are "" when absent, pointer fields are nil when absent — check pointers before dereferencing.
+type FirecrawlDocument struct {
+	// Markdown is the page content rendered as Markdown.
+	Markdown string `json:"markdown,omitempty"`
+	// HTML is the page content as cleaned HTML.
+	HTML string `json:"html,omitempty"`
+	// RawHTML is the raw, unprocessed HTML of the page.
+	RawHTML string `json:"rawHtml,omitempty"`
+	// Screenshot is the base64-encoded screenshot of the page.
+	Screenshot string `json:"screenshot,omitempty"`
+	// JSON contains structured data extracted according to JsonOptions.
+	JSON map[string]any `json:"json,omitempty"`
+	// Links is a list of URLs found on the page.
+	Links []string `json:"links,omitempty"`
+	// Metadata contains page metadata (title, OG tags, HTTP status, etc.).
+	Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
+	// Summary is a generated summary of the page content.
+	Summary *string `json:"summary,omitempty"`
+	// Images is a list of image URLs found on the page.
+	Images []string `json:"images,omitempty"`
+	// Actions contains the results of browser actions executed during scraping.
+	Actions *ActionsResult `json:"actions,omitempty"`
+	// Warning is a non-fatal warning message from the scrape operation.
+	Warning *string `json:"warning,omitempty"`
+	// ChangeTracking contains change tracking information if the "changeTracking" format was requested.
+	ChangeTracking *ChangeTrackingResult `json:"changeTracking,omitempty"`
+	// Branding contains extracted branding information if the "branding" format was requested.
+	Branding *BrandingResult `json:"branding,omitempty"`
+}
+
+// ScrapeParams represents the parameters for a scrape request.
+type ScrapeParams struct {
+	// Formats specifies which output formats to return (e.g., "markdown", "html", "rawHtml",
+	// "screenshot", "json", "links", "summary", "images", "changeTracking", "branding").
+	Formats []string `json:"formats,omitempty"`
+	// Headers are custom HTTP headers to send with the request. NOTE(review): pointer-to-map is unusual in Go (a nil map already marshals as absent) and is inconsistent with WebhookConfig.Headers; kept as-is for API compatibility.
+	Headers *map[string]string `json:"headers,omitempty"`
+	// IncludeTags limits HTML parsing to only these CSS selectors.
+	IncludeTags []string `json:"includeTags,omitempty"`
+	// ExcludeTags removes these CSS selectors from the parsed output.
+	ExcludeTags []string `json:"excludeTags,omitempty"`
+	// OnlyMainContent strips navigation, footers, and sidebars when true.
+	OnlyMainContent *bool `json:"onlyMainContent,omitempty"`
+	// WaitFor is the number of milliseconds to wait after page load before scraping.
+	WaitFor *int `json:"waitFor,omitempty"`
+	// Timeout is the maximum time in milliseconds to wait for the page to load.
+	Timeout *int `json:"timeout,omitempty"`
+	// MaxAge is the maximum age in milliseconds of a cached result to accept.
+	MaxAge *int `json:"maxAge,omitempty"`
+	// MinAge is the minimum age in milliseconds of a cached result to accept.
+	MinAge *int `json:"minAge,omitempty"`
+	// JsonOptions configures LLM-based JSON extraction.
+	JsonOptions *JsonOptions `json:"jsonOptions,omitempty"`
+	// Mobile emulates a mobile browser when true.
+	Mobile *bool `json:"mobile,omitempty"`
+	// SkipTlsVerification skips TLS certificate verification when true.
+	SkipTlsVerification *bool `json:"skipTlsVerification,omitempty"`
+	// BlockAds blocks ads and tracking scripts when true.
+	BlockAds *bool `json:"blockAds,omitempty"`
+	// Proxy selects the proxy tier to use ("basic", "enhanced", or "auto").
+	Proxy *string `json:"proxy,omitempty"`
+	// Location configures the geolocation for the request.
+	Location *LocationConfig `json:"location,omitempty"`
+	// Parsers configures document parsers. Use Type "pdf" to replace the v1 ParsePDF flag.
+	Parsers []ParserConfig `json:"parsers,omitempty"`
+	// Actions is a list of browser actions to execute before scraping.
+	Actions []ActionConfig `json:"actions,omitempty"`
+	// RemoveBase64Images strips base64-encoded inline images from the output when true.
+	RemoveBase64Images *bool `json:"removeBase64Images,omitempty"`
+	// StoreInCache stores the scrape result in the Firecrawl cache when true.
+	StoreInCache *bool `json:"storeInCache,omitempty"`
+	// ZeroDataRetention prevents Firecrawl from retaining scraped data when true.
+	ZeroDataRetention *bool `json:"zeroDataRetention,omitempty"`
+	// ParsePDF is removed in v2 — use Parsers: []ParserConfig{{Type: "pdf"}} instead.
+	// Deprecated: removed in v2.
+}
+
+// ScrapeResponse represents the response for a scrape operation.
+type ScrapeResponse struct {
+	Success bool `json:"success"`
+	Data *FirecrawlDocument `json:"data,omitempty"`
+}
+
+// CrawlParams represents the parameters for a crawl request.
+type CrawlParams struct {
+	// ScrapeOptions configures how each page is scraped during the crawl. NOTE(review): encoding/json's omitempty never omits a non-pointer struct, so an empty scrapeOptions object is always serialized.
+	ScrapeOptions ScrapeParams `json:"scrapeOptions,omitempty"`
+	// Webhook configures the webhook endpoint to receive crawl events.
+	Webhook *WebhookConfig `json:"webhook,omitempty"`
+	// Limit is the maximum number of pages to crawl (default 10000).
+	Limit *int `json:"limit,omitempty"`
+	// IncludePaths restricts crawling to URLs matching these path patterns.
+	IncludePaths []string `json:"includePaths,omitempty"`
+	// ExcludePaths skips URLs matching these path patterns.
+	ExcludePaths []string `json:"excludePaths,omitempty"`
+	// AllowExternalLinks allows following links to external domains when true.
+	AllowExternalLinks *bool `json:"allowExternalLinks,omitempty"`
+	// IgnoreQueryParameters ignores URL query parameters when deduplicating pages.
+	IgnoreQueryParameters *bool `json:"ignoreQueryParameters,omitempty"`
+	// MaxDiscoveryDepth is the maximum link depth from the seed URL to follow. Replaces v1 MaxDepth.
+	MaxDiscoveryDepth *int `json:"maxDiscoveryDepth,omitempty"`
+	// Sitemap controls sitemap behavior: "skip", "include", or "only". Replaces v1 IgnoreSitemap.
+	Sitemap *string `json:"sitemap,omitempty"`
+	// CrawlEntireDomain follows links back to previously visited pages when true. Replaces v1 AllowBackwardLinks.
+	CrawlEntireDomain *bool `json:"crawlEntireDomain,omitempty"`
+	// AllowSubdomains allows crawling subdomains of the seed URL when true.
+	AllowSubdomains *bool `json:"allowSubdomains,omitempty"`
+	// Delay is the number of seconds to wait between page scrapes.
+	Delay *float64 `json:"delay,omitempty"`
+	// MaxConcurrency is the maximum number of pages to scrape concurrently.
+	MaxConcurrency *int `json:"maxConcurrency,omitempty"`
+	// Prompt is a natural language description of which pages to crawl.
+	Prompt *string `json:"prompt,omitempty"`
+	// RegexOnFullURL applies include/exclude path patterns to the full URL when true.
+	RegexOnFullURL *bool `json:"regexOnFullURL,omitempty"`
+	// ZeroDataRetention prevents Firecrawl from retaining crawled data when true.
+	ZeroDataRetention *bool `json:"zeroDataRetention,omitempty"`
+}
+
+// CrawlResponse represents the initial response when starting a crawl job.
+type CrawlResponse struct {
+	Success bool `json:"success"`
+	ID string `json:"id,omitempty"`
+	URL string `json:"url,omitempty"`
+}
+
+// CrawlStatusResponse represents the status of an in-progress or completed crawl job.
+// v2 status values are: "scraping", "completed", "failed".
+type CrawlStatusResponse struct {
+	// Status is the current crawl status ("scraping", "completed", "failed").
+	Status string `json:"status"`
+	// Total is the total number of pages discovered.
+	Total int `json:"total,omitempty"`
+	// Completed is the number of pages scraped so far.
+	Completed int `json:"completed,omitempty"`
+	// CreditsUsed is the number of API credits consumed by the crawl.
+	CreditsUsed int `json:"creditsUsed,omitempty"`
+	// ExpiresAt is the RFC3339 timestamp when the crawl result expires.
+	ExpiresAt string `json:"expiresAt,omitempty"`
+	// Next is the URL of the next results page for paginated crawl status responses.
+	Next *string `json:"next,omitempty"`
+	// Data contains the scraped documents for the current results page.
+	Data []*FirecrawlDocument `json:"data,omitempty"`
+}
+
+// CancelCrawlJobResponse represents the response for canceling a crawl job.
+type CancelCrawlJobResponse struct {
+	Success bool `json:"success"`
+	Status string `json:"status"`
+}
+
+// MapLink represents a link object in the v2 Map response.
+// v2 returns rich link objects instead of plain strings.
+type MapLink struct {
+	// URL is the absolute URL of the discovered link.
+	URL string `json:"url"`
+	// Title is the optional page title of the linked page.
+	Title *string `json:"title,omitempty"`
+	// Description is the optional meta description of the linked page.
+	Description *string `json:"description,omitempty"`
+}
+
+// MapParams represents the parameters for a map request.
+type MapParams struct {
+	// IncludeSubdomains includes links to subdomains of the target URL when true.
+	IncludeSubdomains *bool `json:"includeSubdomains,omitempty"`
+	// Search filters the map results to URLs containing this search term.
+	Search *string `json:"search,omitempty"`
+	// Limit is the maximum number of links to return (default 5000, max 100000).
+	Limit *int `json:"limit,omitempty"`
+	// Sitemap controls sitemap behavior: "skip", "include", or "only". Replaces v1 IgnoreSitemap.
+	Sitemap *string `json:"sitemap,omitempty"`
+	// IgnoreQueryParameters ignores URL query parameters when deduplicating links.
+	IgnoreQueryParameters *bool `json:"ignoreQueryParameters,omitempty"`
+	// IgnoreCache bypasses the Firecrawl cache and re-fetches the sitemap/pages.
+	IgnoreCache *bool `json:"ignoreCache,omitempty"`
+	// Timeout is the maximum time in milliseconds for the map operation.
+	Timeout *int `json:"timeout,omitempty"`
+	// Location configures the geolocation for the map request.
+	Location *LocationConfig `json:"location,omitempty"`
+}
+
+// MapResponse represents the response for a map operation. NOTE(review): this is the only response type here with an inline Error field — confirm whether the others should carry one too.
+type MapResponse struct {
+	Success bool `json:"success"`
+	Links []MapLink `json:"links,omitempty"`
+	Error string `json:"error,omitempty"`
+}
+
+// PaginationConfig controls pagination behavior for status-checking methods.
+type PaginationConfig struct {
+	// AutoPaginate automatically follows "next" URLs and aggregates results when true.
+	AutoPaginate *bool `json:"autoPaginate,omitempty"`
+	// MaxPages is the maximum number of result pages to fetch during auto-pagination.
+	MaxPages *int `json:"maxPages,omitempty"`
+	// MaxResults is the maximum total number of results to collect during auto-pagination.
+	MaxResults *int `json:"maxResults,omitempty"`
+	// MaxWaitTime is the maximum number of seconds to spend polling before giving up.
+	MaxWaitTime *int `json:"maxWaitTime,omitempty"`
+}
+
+// SearchParams represents the parameters for a search request.
+type SearchParams struct {
+	// Limit is the maximum number of results to return.
+	Limit *int `json:"limit,omitempty"`
+	// Sources specifies which result types to include ("web", "images", "news").
+	Sources []string `json:"sources,omitempty"`
+	// Categories restricts results to specific content categories (e.g., "github", "research", "pdf").
+	Categories []string `json:"categories,omitempty"`
+	// TBS is the time-based search filter (e.g., "qdr:d" for past day, "qdr:w" for past week).
+	TBS *string `json:"tbs,omitempty"`
+	// Location is the geographic location to use for localized search results.
+	Location *string `json:"location,omitempty"`
+	// Country is the ISO 3166-1 alpha-2 country code for the search context.
+	Country *string `json:"country,omitempty"`
+	// Timeout is the maximum time in milliseconds for the search operation.
+	Timeout *int `json:"timeout,omitempty"`
+	// IgnoreInvalidURLs skips invalid URLs in results rather than failing.
+	IgnoreInvalidURLs *bool `json:"ignoreInvalidURLs,omitempty"`
+	// ScrapeOptions configures how result pages are scraped when content is requested.
+	ScrapeOptions *ScrapeParams `json:"scrapeOptions,omitempty"`
+}
+
+// SearchWebResult represents a single web search result.
+type SearchWebResult struct {
+	// Title is the page title of the result.
+	Title string `json:"title"`
+	// Description is the snippet or meta description of the result.
+	Description string `json:"description"`
+	// URL is the URL of the result.
+	URL string `json:"url"`
+	// Markdown is the scraped Markdown content (present when scrapeOptions includes "markdown").
+	Markdown *string `json:"markdown,omitempty"`
+	// HTML is the scraped HTML content (present when scrapeOptions includes "html").
+	HTML *string `json:"html,omitempty"`
+	// Metadata contains page metadata for the result.
+	Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
+}
+
+// SearchImageResult represents a single image search result.
+type SearchImageResult struct {
+	// Title is the title or alt text of the image.
+	Title string `json:"title"`
+	// ImageURL is the direct URL of the image.
+	ImageURL string `json:"imageUrl"`
+	// ImageWidth is the width of the image in pixels.
+	ImageWidth int `json:"imageWidth"`
+	// ImageHeight is the height of the image in pixels.
+	ImageHeight int `json:"imageHeight"`
+	// URL is the URL of the page containing the image.
+	URL string `json:"url"`
+	// Position is the 1-based rank of this result in the search response.
+	Position int `json:"position"`
+}
+
+// SearchNewsResult represents a single news search result.
+type SearchNewsResult struct {
+	// Title is the headline of the news article.
+	Title string `json:"title"`
+	// Snippet is a short excerpt from the news article.
+	Snippet string `json:"snippet"`
+	// URL is the URL of the news article.
+	URL string `json:"url"`
+	// Date is the publication date of the news article.
+	Date string `json:"date"`
+	// ImageURL is the optional URL of the article's featured image.
+	ImageURL *string `json:"imageUrl,omitempty"`
+	// Position is the 1-based rank of this result in the search response.
+	Position int `json:"position"`
+}
+
+// SearchData contains categorized search results.
+type SearchData struct {
+	// Web contains web search results.
+	Web []SearchWebResult `json:"web,omitempty"`
+	// Images contains image search results.
+	Images []SearchImageResult `json:"images,omitempty"`
+	// News contains news search results.
+	News []SearchNewsResult `json:"news,omitempty"`
+}
+
+// SearchResponse represents the response for a search operation.
+type SearchResponse struct {
+	// Success indicates whether the search request succeeded.
+	Success bool `json:"success"`
+	// Data contains the categorized search results.
+	Data SearchData `json:"data"`
+	// Warning is a non-fatal warning message from the search operation.
+	Warning *string `json:"warning,omitempty"`
+	// ID is the unique identifier for this search request.
+	ID string `json:"id,omitempty"`
+	// CreditsUsed is the number of API credits consumed by this search.
+	CreditsUsed int `json:"creditsUsed,omitempty"`
+}
+
+// BatchScrapeParams represents the parameters for a batch scrape request.
+type BatchScrapeParams struct {
+	// ScrapeOptions configures how each URL is scraped. NOTE(review): encoding/json's omitempty never omits a non-pointer struct, so an empty scrapeOptions object is always serialized.
+	ScrapeOptions ScrapeParams `json:"scrapeOptions,omitempty"`
+	// MaxConcurrency is the maximum number of URLs to scrape concurrently.
+	MaxConcurrency *int `json:"maxConcurrency,omitempty"`
+	// IgnoreInvalidURLs skips invalid URLs rather than failing the entire batch.
+	IgnoreInvalidURLs *bool `json:"ignoreInvalidURLs,omitempty"`
+	// Webhook configures the webhook endpoint to receive batch scrape events.
+	Webhook *WebhookConfig `json:"webhook,omitempty"`
+}
+
+// BatchScrapeResponse represents the initial response when starting a batch scrape job.
+type BatchScrapeResponse struct {
+	// Success indicates whether the batch scrape job was started successfully.
+	Success bool `json:"success"`
+	// ID is the job identifier for polling status.
+	ID string `json:"id,omitempty"`
+	// URL is the polling URL for checking job status.
+	URL string `json:"url,omitempty"`
+	// InvalidURLs lists any URLs that were rejected before the job started.
+	InvalidURLs []string `json:"invalidURLs,omitempty"`
+}
+
+// BatchScrapeStatusResponse represents the status of an in-progress or completed batch scrape job.
+type BatchScrapeStatusResponse struct {
+	// Status is the current job status ("scraping", "completed", "failed").
+	Status string `json:"status"`
+	// Total is the total number of URLs in the batch.
+	Total int `json:"total,omitempty"`
+	// Completed is the number of URLs scraped so far.
+	Completed int `json:"completed,omitempty"`
+	// CreditsUsed is the number of API credits consumed by the batch.
+	CreditsUsed int `json:"creditsUsed,omitempty"`
+	// ExpiresAt is the RFC3339 timestamp when the batch result expires.
+	ExpiresAt string `json:"expiresAt,omitempty"`
+	// Next is the URL of the next results page for paginated status responses.
+	Next *string `json:"next,omitempty"`
+	// Data contains the scraped documents for the current results page.
+	Data []*FirecrawlDocument `json:"data,omitempty"`
+}
+
+// ExtractParams represents the parameters for an extract request.
+// Extract performs LLM-based structured data extraction from one or more URLs.
+type ExtractParams struct {
+	// Prompt is a natural language description of the data to extract.
+	Prompt *string `json:"prompt,omitempty"`
+	// Schema is a JSON Schema definition for the structured output.
+	Schema map[string]any `json:"schema,omitempty"`
+	// EnableWebSearch augments extraction with web search when true.
+	EnableWebSearch *bool `json:"enableWebSearch,omitempty"`
+	// IgnoreSitemap skips sitemap discovery and only processes the provided URLs.
+	IgnoreSitemap *bool `json:"ignoreSitemap,omitempty"`
+	// IncludeSubdomains includes subdomains of the provided URLs in extraction.
+	IncludeSubdomains *bool `json:"includeSubdomains,omitempty"`
+	// ShowSources includes source attribution in the extraction result.
+	ShowSources *bool `json:"showSources,omitempty"`
+	// IgnoreInvalidURLs skips invalid URLs rather than failing the extraction.
+	IgnoreInvalidURLs *bool `json:"ignoreInvalidURLs,omitempty"`
+	// ScrapeOptions configures how pages are scraped before extraction.
+	ScrapeOptions *ScrapeParams `json:"scrapeOptions,omitempty"`
+}
+
+// ExtractResponse represents the initial response when starting an extract job.
+type ExtractResponse struct {
+	// Success indicates whether the extract job was started successfully.
+	Success bool `json:"success"`
+	// ID is the job identifier for polling status.
+	ID string `json:"id,omitempty"`
+	// InvalidURLs lists any URLs that were rejected before the job started.
+	InvalidURLs []string `json:"invalidURLs,omitempty"`
+}
+
+// ExtractStatusResponse represents the status of an in-progress or completed extract job.
+type ExtractStatusResponse struct {
+	// Success indicates whether the extraction succeeded.
+	Success bool `json:"success"`
+	// Status is the current job status ("processing", "completed", "failed").
+	Status string `json:"status"`
+	// Data contains the extracted structured data upon completion.
+	Data map[string]any `json:"data,omitempty"`
+	// ExpiresAt is the RFC3339 timestamp when the extract result expires.
+	ExpiresAt string `json:"expiresAt,omitempty"`
+	// CreditsUsed is the number of API credits consumed by the extraction.
+	CreditsUsed int `json:"creditsUsed,omitempty"`
+}
diff --git a/types_test.go b/types_test.go
new file mode 100644
index 0000000..21b9c35
--- /dev/null
+++ b/types_test.go
@@ -0,0 +1,64 @@
+package firecrawl
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestStringOrStringSlice_SingleString(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`"hello"`))
+	require.NoError(t, err)
+	assert.Equal(t, StringOrStringSlice{"hello"}, s)
+}
+
+func TestStringOrStringSlice_StringArray(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`["a","b","c"]`))
+	require.NoError(t, err)
+	assert.Equal(t, StringOrStringSlice{"a", "b", "c"}, s)
+}
+
+func TestStringOrStringSlice_EmptyArray(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`[]`))
+	require.NoError(t, err)
+	assert.Equal(t, StringOrStringSlice{}, s)
+}
+
+func TestStringOrStringSlice_EmptyString(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`""`))
+	require.NoError(t, err)
+	assert.Equal(t, StringOrStringSlice{""}, s)
+}
+
+func TestStringOrStringSlice_InvalidType_Number(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`123`))
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "neither a string nor a list of strings")
+}
+
+func TestStringOrStringSlice_InvalidType_Boolean(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`true`))
+	assert.Error(t, err)
+}
+
+func TestStringOrStringSlice_InvalidType_Object(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`{"key":"value"}`))
+	assert.Error(t, err)
+}
+
+func TestStringOrStringSlice_Null(t *testing.T) {
+	var s StringOrStringSlice
+	err := s.UnmarshalJSON([]byte(`null`))
+	// encoding/json leaves a plain string at its zero value ("") when decoding null,
+	// so the single-string branch succeeds and yields a one-element slice {""}.
+	require.NoError(t, err)
+	assert.Equal(t, StringOrStringSlice{""}, s)
+}