Skip to content

Commit ee13248

Browse files
authored
Add date range support to access control rules (#949)
* Implement access rule date support for before, after, newer, older * Add documentation
1 parent 162d770 commit ee13248

File tree

8 files changed

+161
-18
lines changed

8 files changed

+161
-18
lines changed

docs/manual/access-control.rst

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ An .aclj file may look as follows::
9595

9696
Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any).
9797

98-
The JSON entry may also contain a ``user`` field, as explained below.
98+
The JSON entry may also contain ``user``, ``before``, ``after``, ``newer``, and ``older`` fields, as explained in the sections below.
9999

100100
The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later).
101101

@@ -166,6 +166,41 @@ Further examples of how to set this header will be provided in the deployments s
166166
See the :ref:`config-acl-header` section in Usage for examples on how to configure this header.
167167

168168

169+
Date-Based Access Controls: Before/After Exact Date
170+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
171+
172+
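It is also possible to control access based on capture timestamp, using the ``before`` and ``after`` fields to specify a fixed cutoff timestamp. The value may be a partial timestamp such as ``2010`` or ``20101201``, as in the examples below. A capture matches ``before`` only if its timestamp is strictly earlier than the cutoff, and matches ``after`` only if it is strictly later.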
173+
174+
For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for captures prior to December 1, 2010::
175+
176+
com,example)/restricted - {"access": "allow", "before": "20101201"}
177+
com,example)/restricted - {"access": "block"}
178+
179+
180+
Combined with the embargo settings, this can also be used to override the embargo for captures that fall within a particular time period, while keeping the embargo for general access::
181+
182+
com,example)/restricted - {"access": "allow_ignore_embargo", "before": "2010"}
183+
com,example)/restricted - {"access": "allow"}
184+
185+
186+
Date-Based Access Controls: Time Interval
187+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
188+
189+
Access can also be controlled by specifying a relative time interval, similar to embargos.
190+
191+
For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access to all captures newer than 1 year::
192+
193+
com,example)/restricted - {"access": "allow", "newer": {"years": 1}}
194+
com,example)/restricted - {"access": "block"}
195+
196+
The following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access to all captures older than 1 year, 2 months, 3 weeks, and 4 days::
197+
198+
com,example)/restricted - {"access": "allow", "older": {"years": 1, "months": 2, "weeks": 3, "days": 4}}
199+
com,example)/restricted - {"access": "block"}
200+
201+
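Any combination of years, months, weeks and days can be used (as long as at least one is provided) for the ``newer`` or ``older`` access control settings. Only one date-based field is evaluated per rule: ``before``, ``after``, ``newer`` and ``older`` are checked in that order, and the first one present determines the result.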
202+
203+
169204
Access Error Messages
170205
^^^^^^^^^^^^^^^^^^^^^
171206

pywb/warcserver/access_checker.py

Lines changed: 62 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,53 @@ def check_embargo(self, url, ts):
173173
actual = datetime.now(timezone.utc) - older
174174
return access if actual > dt else None
175175

176+
def check_date_access(
177+
self, ts, access, default_access, rule
178+
):
179+
"""Return access based on date fields in access rule
180+
181+
If a date-based field exists and its condition is not met, return default_access
182+
If no date-based rule exists, return access
183+
"""
184+
if not rule:
185+
return access
186+
187+
dt = timestamp_to_datetime(ts, tz_aware=True)
188+
189+
before_ts = rule.get('before')
190+
if before_ts:
191+
before = timestamp_to_datetime(before_ts, tz_aware=True)
192+
return access if dt < before else default_access
193+
194+
after_ts = rule.get('after')
195+
if after_ts:
196+
after = timestamp_to_datetime(after_ts, tz_aware=True)
197+
return access if dt > after else default_access
198+
199+
newer = rule.get('newer')
200+
if newer:
201+
delta = relativedelta(
202+
years=newer.get('years', 0),
203+
months=newer.get('months', 0),
204+
weeks=newer.get('weeks', 0),
205+
days=newer.get('days', 0)
206+
)
207+
actual = datetime.now(timezone.utc) - delta
208+
return access if actual < dt else default_access
209+
210+
older = rule.get('older')
211+
if older:
212+
delta = relativedelta(
213+
years=older.get('years', 0),
214+
months=older.get('months', 0),
215+
weeks=older.get('weeks', 0),
216+
days=older.get('days', 0)
217+
)
218+
actual = datetime.now(timezone.utc) - delta
219+
return access if actual > dt else default_access
220+
221+
return access
222+
176223
def create_access_aggregator(self, source_files):
177224
"""Creates a new AccessRulesAggregator using the supplied list
178225
of access control file names
@@ -300,10 +347,7 @@ def wrap_iter(self, cdx_iter, acl_user):
300347
:param str acl_user: The user associated with this request (optional)
301348
:return: The wrapped cdx object iterator
302349
"""
303-
last_rule = None
304-
last_url = None
305-
last_user = None
306-
rule = None
350+
default_access = self.default_rule['access']
307351

308352
for cdx in cdx_iter:
309353
url = cdx.get('url')
@@ -314,19 +358,24 @@ def wrap_iter(self, cdx_iter, acl_user):
314358
yield cdx
315359
continue
316360

361+
rule = None
317362
access = None
363+
318364
if self.aggregator:
319-
# TODO: optimization until date range support is included
320-
if url == last_url and acl_user == last_user:
321-
rule = last_rule
322-
else:
323-
rule = self.find_access_rule(url, timestamp,
324-
cdx.get('urlkey'),
325-
cdx.get('source-coll'),
326-
acl_user)
365+
rule = self.find_access_rule(
366+
url,
367+
timestamp,
368+
cdx.get('urlkey'),
369+
cdx.get('source-coll'),
370+
acl_user
371+
)
327372

328373
access = rule.get('access', 'exclude')
329374

375+
access = self.check_date_access(
376+
timestamp, access, default_access, rule
377+
)
378+
330379
if access != 'allow_ignore_embargo' and access != 'exclude':
331380
embargo_access = self.check_embargo(url, timestamp)
332381
if embargo_access and embargo_access != 'allow':
@@ -336,14 +385,10 @@ def wrap_iter(self, cdx_iter, acl_user):
336385
continue
337386

338387
if not access:
339-
access = self.default_rule['access']
388+
access = default_access
340389

341390
if access == 'allow_ignore_embargo':
342391
access = 'allow'
343392

344393
cdx['access'] = access
345394
yield cdx
346-
347-
last_rule = rule
348-
last_url = url
349-
last_user = acl_user

sample_archive/access/after.aclj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "after": "20140126"}

sample_archive/access/before.aclj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
org,iana)/domains/arpa - {"access": "allow", "url": "http://www.iana.org/domains/arpa", "before": "20140127"}
2+
org,iana)/domains - {"access": "block", "url": "http://www.iana.org/domains"}
3+
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "before": "20140126"}

sample_archive/access/newer.aclj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "newer": {"years": 1, "months": 6}}

sample_archive/access/older.aclj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "older": {"years": 1}}

tests/config_test_access.yaml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,34 @@ collections:
6262
acl_paths:
6363
- ./sample_archive/access/pywb.aclj
6464

65+
pywb-acl-before:
66+
index_paths: ./sample_archive/cdx/
67+
archive_paths: ./sample_archive/warcs/
68+
default_access: block
69+
acl_paths:
70+
- ./sample_archive/access/before.aclj
71+
72+
pywb-acl-after:
73+
index_paths: ./sample_archive/cdx/
74+
archive_paths: ./sample_archive/warcs/
75+
default_access: block
76+
acl_paths:
77+
- ./sample_archive/access/after.aclj
78+
79+
pywb-acl-newer:
80+
index_paths: ./sample_archive/cdx/
81+
archive_paths: ./sample_archive/warcs/
82+
default_access: block
83+
acl_paths:
84+
- ./sample_archive/access/newer.aclj
85+
86+
pywb-acl-older:
87+
index_paths: ./sample_archive/cdx/
88+
archive_paths: ./sample_archive/warcs/
89+
default_access: block
90+
acl_paths:
91+
- ./sample_archive/access/older.aclj
92+
6593
pywb-wildcard-surt:
6694
index_paths: ./sample_archive/cdx/
6795
archive_paths: ./sample_archive/warcs/

tests/test_acl.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,32 @@ def test_allow_all_acl_user_specific(self):
102102
assert 'Access Blocked' in resp.text
103103

104104
resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
105+
106+
def test_acl_before(self):
107+
resp = self.testapp.get('/pywb-acl-before/20140127171238mp_/http://www.iana.org/', status=451)
108+
assert 'Access Blocked' in resp.text
109+
110+
resp = self.testapp.get('/pywb-acl-before/20140126200624mp_/http://www.iana.org/', status=200)
111+
112+
resp = self.testapp.get('/pywb-acl-before/20140126200825mp_/http://www.iana.org/domains', status=451)
113+
assert 'Access Blocked' in resp.text
114+
115+
resp = self.testapp.get('/pywb-acl-before/20140126201248mp_/http://www.iana.org/domains/arpa', status=200)
116+
117+
def test_acl_after(self):
118+
resp = self.testapp.get('/pywb-acl-after/20140126200624mp_/http://www.iana.org/', status=451)
119+
assert 'Access Blocked' in resp.text
120+
121+
resp = self.testapp.get('/pywb-acl-after/20140127171238mp_/http://www.iana.org/', status=200)
122+
123+
def test_acl_newer(self):
124+
resp = self.testapp.get('/pywb-acl-newer/20140127171238mp_/http://www.iana.org/', status=451)
125+
assert 'Access Blocked' in resp.text
126+
127+
resp = self.testapp.get('/pywb-acl-newer/20140126200624mp_/http://www.iana.org/', status=451)
128+
assert 'Access Blocked' in resp.text
129+
130+
def test_acl_older(self):
131+
resp = self.testapp.get('/pywb-acl-older/20140127171238mp_/http://www.iana.org/', status=200)
132+
133+
resp = self.testapp.get('/pywb-acl-older/20140126200624mp_/http://www.iana.org/', status=200)

0 commit comments

Comments
 (0)