Skip to content

Commit 89db0d6

Browse files
adonig and Andreas Donig authored
Add normalise_url parameter to Crawly.Middlewares.UniqueRequest (#295)
Co-authored-by: Andreas Donig <git@innwiese.de>
1 parent 598844b commit 89db0d6

File tree

2 files changed

+36
-7
lines changed

2 files changed

+36
-7
lines changed

lib/crawly/middlewares/unique_request.ex

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ defmodule Crawly.Middlewares.UniqueRequest do
1717
]
1818
```
1919
20+
```
21+
middlewares: [
22+
{Crawly.Middlewares.UniqueRequest, hash: :sha, normalise_url: fn url -> String.trim_trailing(url, "/") end}
23+
]
24+
```
25+
2026
See the [Erlang documentation for crypto](https://www.erlang.org/doc/man/crypto.html#type-sha1)
2127
for available algorithms.
2228
"""
@@ -26,16 +32,24 @@ defmodule Crawly.Middlewares.UniqueRequest do
2632
unique_request_seen_requests =
2733
Map.get(state, :unique_request_seen_requests, %{})
2834

29-
# we assume that https://example/foo and https://example/foo/ refer to the same content,
30-
# in case they are both accessible
31-
normalised_url = request.url |> String.replace_suffix("/", "")
35+
normalised_url =
36+
case opts[:normalise_url] do
37+
nil ->
38+
# Default: assume https://example/foo and https://example/foo/ serve the same content, so strip all trailing slashes.
39+
request.url |> String.trim_trailing("/")
40+
41+
normalise_url when is_function(normalise_url, 1) ->
42+
normalise_url.(request.url)
43+
44+
_ ->
45+
raise ArgumentError, "normalise_url must be a function with arity 1"
46+
end
3247

3348
# optionally hash the URL
3449
unique_hash =
35-
if algo = opts[:hash] do
36-
:crypto.hash(algo, normalised_url)
37-
else
38-
normalised_url
50+
case opts[:hash] do
51+
nil -> normalised_url
52+
algo -> :crypto.hash(algo, normalised_url)
3953
end
4054

4155
case Map.get(unique_request_seen_requests, unique_hash) do

test/middlewares/unique_request_test.exs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,19 @@ defmodule Middlewares.UniqueRequestTest do
3232
# run again, should drop the request
3333
assert {false, _state} = Crawly.Utils.pipe(middlewares, @valid_slash, state)
3434
end
35+
36+
test "Uses the normalise_url function if given" do
37+
middlewares = [
38+
{Crawly.Middlewares.UniqueRequest, normalise_url: fn url -> url end}
39+
]
40+
41+
state = %{spider_name: :test_spider, crawl_id: "123"}
42+
43+
assert {%Crawly.Request{}, state} =
44+
Crawly.Utils.pipe(middlewares, @valid, state)
45+
46+
# run again, should not drop the request, because normalise_url overrides default
47+
assert {%Crawly.Request{}, _state} =
48+
Crawly.Utils.pipe(middlewares, @valid_slash, state)
49+
end
3550
end

0 commit comments

Comments
 (0)