@@ -17,6 +17,12 @@ defmodule Crawly.Middlewares.UniqueRequest do
1717 ]
1818 ```
1919
20+ ```
21+ middlewares: [
22+ {Crawly.Middlewares.UniqueRequest, hash: :sha, normalise_url: fn url -> String.trim_trailing(url, "/") end}
23+ ]
24+ ```
25+
2026 See the [Erlang documentation for crypto](https://www.erlang.org/doc/man/crypto.html#type-sha1)
2127 for available algorithms.
2228 """
@@ -26,16 +32,24 @@ defmodule Crawly.Middlewares.UniqueRequest do
2632 unique_request_seen_requests =
2733 Map . get ( state , :unique_request_seen_requests , % { } )
2834
29- # we assume that https://example/foo and https://example/foo/ refer to the same content,
30- # in case they are both accessible
31- normalised_url = request . url |> String . replace_suffix ( "/" , "" )
35+ normalised_url =
36+ case opts [ :normalise_url ] do
37+ nil ->
38+ # Assuming that trailing slashes do not affect the content.
39+ request . url |> String . trim_trailing ( "/" )
40+
41+ normalise_url when is_function ( normalise_url , 1 ) ->
42+ normalise_url . ( request . url )
43+
44+ _ ->
45+ raise ArgumentError , "normalise_url must be a function with arity 1"
46+ end
3247
3348 # optionally hash the URL
3449 unique_hash =
35- if algo = opts [ :hash ] do
36- :crypto . hash ( algo , normalised_url )
37- else
38- normalised_url
50+ case opts [ :hash ] do
51+ nil -> normalised_url
52+ algo -> :crypto . hash ( algo , normalised_url )
3953 end
4054
4155 case Map . get ( unique_request_seen_requests , unique_hash ) do
0 commit comments