Skip to content

Commit b657658

Browse files
authored
Merge pull request #345 from digital-land/referential-check-issue-type
Add issue type to check referential integrity for linked datasets
2 parents ad7d52a + 23d14a5 commit b657658

File tree

5 files changed

+157
-0
lines changed

5 files changed

+157
-0
lines changed

digital_land/cli.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
column_field_dir,
4545
converted_resource_dir,
4646
output_log_dir,
47+
provision_summary_dir,
4748
)
4849

4950

@@ -227,6 +228,7 @@ def dataset_dump_flattened_cmd(ctx, input_path, output_path):
227228
@dataset_resource_dir
228229
@converted_resource_dir
229230
@organisation_path
231+
@provision_summary_dir
230232
@collection_dir
231233
@operational_issue_dir
232234
@output_log_dir
@@ -250,6 +252,7 @@ def pipeline_command(
250252
config_path,
251253
resource,
252254
output_log_dir,
255+
provision_summary_dir,
253256
):
254257
dataset = ctx.obj["DATASET"]
255258
pipeline = ctx.obj["PIPELINE"]
@@ -279,6 +282,7 @@ def pipeline_command(
279282
config_path=config_path,
280283
resource=resource,
281284
output_log_dir=output_log_dir,
285+
provision_summary_dir=provision_summary_dir,
282286
)
283287

284288

digital_land/command_arguments.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,11 @@ def organisation_path(f):
9797
type=click.Path(exists=True),
9898
default="var/cache/organisation.csv",
9999
)(f)
100+
101+
102+
def provision_summary_dir(f):
    """Attach the ``--provision-summary-dir`` click option to command *f*.

    The option is declared as a ``click.Path`` with ``exists=True`` and
    defaults to ``var/cache/provision-summary/``.
    """
    add_option = click.option(
        "--provision-summary-dir",
        default="var/cache/provision-summary/",
        type=click.Path(exists=True),
    )
    return add_option(f)

digital_land/commands.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def pipeline_run(
216216
resource=None,
217217
output_log_dir=None,
218218
converted_path=None,
219+
provision_summary_dir="var/cache/provision_summary",
219220
):
220221
# set up paths
221222
cache_dir = Path(cache_dir)
@@ -330,6 +331,7 @@ def pipeline_run(
330331
issue_log=issue_log,
331332
operational_issue_log=operational_issue_log,
332333
entity_range=[entity_range_min, entity_range_max],
334+
provision_summary_dir=provision_summary_dir,
333335
),
334336
SavePhase(
335337
default_output_path("harmonised", input_path),

digital_land/phase/lookup.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
import logging
3+
import pandas as pd
34

45
from .phase import Phase
56

@@ -33,13 +34,15 @@ def __init__(
3334
issue_log=None,
3435
operational_issue_log=None,
3536
entity_range=[],
37+
provision_summary_dir=None,
3638
):
3739
self.lookups = lookups
3840
self.redirect_lookups = redirect_lookups
3941
self.issues = issue_log
4042
self.operational_issues = operational_issue_log
4143
self.reverse_lookups = self.build_reverse_lookups()
4244
self.entity_range = entity_range
45+
self.provision_summary_dir = provision_summary_dir
4346

4447
def build_reverse_lookups(self):
4548
reverse_lookups = {}
@@ -163,6 +166,47 @@ def process(self, stream):
163166
row[self.entity_field] = self.redirect_entity(
164167
row[self.entity_field]
165168
)
169+
170+
linked_datasets = ["article-4-direction", "tree-preservation-order"]
171+
if row[self.entity_field]:
172+
for linked_dataset in linked_datasets:
173+
if (
174+
row.get(linked_dataset, "")
175+
or row.get(linked_dataset, "").strip()
176+
):
177+
get_organisations = pd.read_csv(
178+
self.provision_summary_dir
179+
+ "/"
180+
+ linked_dataset
181+
+ ".csv"
182+
)
183+
184+
if (
185+
row.get("organisation", "")
186+
in get_organisations["organisation"].values
187+
):
188+
reference = row.get(linked_dataset, "")
189+
find_entity = self.lookup(
190+
prefix=linked_dataset,
191+
organisation=row.get("organisation", ""),
192+
reference=reference,
193+
)
194+
# raise issue if the found entity is retired in old-entity.csv
195+
if not find_entity or (
196+
str(find_entity) in self.redirect_lookups
197+
and int(
198+
self.redirect_lookups[str(find_entity)].get(
199+
"status", 0
200+
)
201+
)
202+
== 410
203+
):
204+
self.issues.log_issue(
205+
linked_dataset,
206+
"no associated documents found for this area",
207+
reference,
208+
line_number=line_number,
209+
)
166210
yield block
167211

168212

tests/unit/phase/test_lookup.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import pandas as pd
12
import pytest
23

34
from digital_land.phase.lookup import LookupPhase, EntityLookupPhase, PrintLookupPhase
@@ -19,6 +20,22 @@ def get_input_stream():
1920
]
2021

2122

23+
@pytest.fixture
def get_input_stream_with_linked_field():
    """Return a one-block input stream whose row carries a linked-dataset field.

    The row uses the ``article-4-direction-area`` prefix and holds an
    ``article-4-direction`` value of ``a4d2``.
    """
    linked_row = {
        "prefix": "article-4-direction-area",
        "reference": "1",
        "organisation": "local-authority:ABC",
        "article-4-direction": "a4d2",
    }
    block = {"row": linked_row, "entry-number": 1, "line-number": 2}
    return [block]
37+
38+
2239
@pytest.fixture
2340
def get_lookup():
2441
return {",dataset,1,test": "1"}
@@ -123,6 +140,88 @@ def test_process_empty_prefix(self, get_lookup):
123140

124141
assert output[0]["row"]["entity"] == "10"
125142

143+
def test_no_associated_documents_issue(
    self, get_input_stream_with_linked_field, mocker
):
    """An issue is logged when the linked reference has no lookup entry.

    The lookups only contain ``a4d1`` for ``article-4-direction`` while the
    row points at ``a4d2``, and the mocked provision summary lists the row's
    organisation.
    """
    issue_log = IssueLog()
    provision_frame = pd.DataFrame({"organisation": ["local-authority:ABC"]})
    lookup_table = {
        ",article-4-direction,a4d1,local-authorityabc": "1",
        ",article-4-direction-area,1,local-authorityabc": "2",
    }

    phase = LookupPhase(
        lookups=lookup_table,
        issue_log=issue_log,
        provision_summary_dir="var/cache/provision-summary/",
    )
    phase.entity_field = "entity"

    # Replace the CSV read with the in-memory provision summary.
    mocker.patch("pandas.read_csv", return_value=provision_frame)
    output = list(phase.process(get_input_stream_with_linked_field))

    assert output[0]["row"]["entity"] == "2"
    first_issue = issue_log.rows[0]
    assert first_issue["issue-type"] == "no associated documents found for this area"
    assert first_issue["value"] == "a4d2"
170+
171+
def test_no_associated_documents_issue_for_missing_dataset(
    self, get_input_stream_with_linked_field, mocker
):
    """No issue is logged when the row's organisation is absent from the summary.

    The mocked provision summary only lists ``local-authority:XYZ`` while
    the row belongs to ``local-authority:ABC``.
    """
    issue_log = IssueLog()
    provision_frame = pd.DataFrame({"organisation": ["local-authority:XYZ"]})
    lookup_table = {
        ",article-4-direction,a4d1,local-authorityabc": "1",
        ",article-4-direction-area,1,local-authorityabc": "2",
    }

    phase = LookupPhase(
        lookups=lookup_table,
        issue_log=issue_log,
        provision_summary_dir="var/cache/provision-summary/",
    )
    phase.entity_field = "entity"

    # Replace the CSV read with the in-memory provision summary.
    mocker.patch("pandas.read_csv", return_value=provision_frame)
    output = list(phase.process(get_input_stream_with_linked_field))

    assert output[0]["row"]["entity"] == "2"
    assert len(issue_log.rows) == 0
194+
195+
def test_no_associated_documents_issue_for_retired_entity(
    self, get_input_stream_with_linked_field, mocker
):
    """An issue is logged when the linked entity is marked with status 410.

    The ``a4d2`` reference resolves to entity ``1``, which the redirect
    lookups record with status ``410``.
    """
    issue_log = IssueLog()
    provision_frame = pd.DataFrame({"organisation": ["local-authority:ABC"]})
    retired = {"1": {"entity": "", "status": "410"}}
    lookup_table = {
        ",article-4-direction,a4d2,local-authorityabc": "1",
        ",article-4-direction-area,1,local-authorityabc": "2",
    }

    phase = LookupPhase(
        lookups=lookup_table,
        redirect_lookups=retired,
        issue_log=issue_log,
        provision_summary_dir="var/cache/provision-summary/",
    )
    phase.entity_field = "entity"

    # Replace the CSV read with the in-memory provision summary.
    mocker.patch("pandas.read_csv", return_value=provision_frame)
    output = list(phase.process(get_input_stream_with_linked_field))

    assert output[0]["row"]["entity"] == "2"
    first_issue = issue_log.rows[0]
    assert first_issue["issue-type"] == "no associated documents found for this area"
    assert first_issue["value"] == "a4d2"
224+
126225

127226
class TestPrintLookupPhase:
128227
def test_process_does_not_produce_new_lookup(self, get_input_stream, get_lookup):

0 commit comments

Comments
 (0)