@@ -62,11 +62,21 @@ def run_pipeline_for_test(test_dirs, dataset, resource, request_id, input_path):
6262 "reference" : "ABC_0001" ,
6363 "entity" : 1 ,
6464 }
65+ row2 = {
66+ "prefix" : "article-4-direction" ,
67+ "resource" : "" ,
68+ "entry-number" : "" ,
69+ "organisation" : organisation ,
70+ "reference" : "a4d2" ,
71+ "entity" : 10 ,
72+ }
73+
6574 fieldnames = row .keys ()
6675 with open (os .path .join (pipeline_dir , "lookup.csv" ), "w" ) as f :
6776 dictwriter = csv .DictWriter (f , fieldnames = fieldnames )
6877 dictwriter .writeheader ()
6978 dictwriter .writerow (row )
79+ dictwriter .writerow (row2 )
7080
7181 # Create organisation.csv with data
7282 row = {"organisation" : "test-org" , "name" : "Test Org" }
@@ -165,7 +175,7 @@ def run_pipeline_for_test(test_dirs, dataset, resource, request_id, input_path):
165175 field_typology_map = specification .get_field_typology_map (),
166176 field_prefix_map = specification .get_field_prefix_map (),
167177 ),
168- FactLookupPhase (lookups ),
178+ FactLookupPhase (lookups , issue_log = issue_log ),
169179 FactPrunePhase (),
170180 SavePhase (
171181 output_path ,
@@ -188,6 +198,7 @@ def run_pipeline_for_test(test_dirs, dataset, resource, request_id, input_path):
188198 return {
189199 "output_path" : output_path ,
190200 "issue_log" : issue_log_path ,
201+ "save_issue_log" : issue_log ,
191202 "column_field_log" : column_field_log_path ,
192203 "dataset_resource_log" : dataset_resource_log_path ,
193204 }
@@ -408,3 +419,98 @@ def test_column_field_log_creation(test_dirs):
408419 assert (
409420 "field" in column_field_log_df .columns
410421 ), "Column field log is missing 'field' column."
422+
423+
def test_pipeline_lookup_phase(test_dirs):
    """The pipeline should log a 'no associated documents found for this area'
    issue when a row references an article-4-direction ('a4d1') that has no
    entry in lookup.csv.

    Writes a one-row resource CSV, runs the pipeline, then inspects the
    in-memory issue log returned under 'save_issue_log'.
    """
    dataset = "article-4-direction-area"
    resource = "5158d13bfc6f0723b1fb07c975701a906e83a1ead4aee598ee34e241c79a5f3d"
    request_id = "test_request_id"

    rows = [
        {
            "reference": "ABC_0001",
            "entry-date": "2025-01-01",
            "organisation": "test-org",
            # 'a4d1' is deliberately absent from the lookup.csv fixture.
            "article-4-direction": "a4d1",
        }
    ]

    input_path = os.path.join(test_dirs["collection_dir"], resource)

    with open(input_path, "w", newline="") as f:
        fieldnames = ["reference", "entry-date", "organisation", "article-4-direction"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    # Run the pipeline
    run_pipeline = run_pipeline_for_test(
        test_dirs=test_dirs,
        dataset=dataset,
        resource=resource,
        request_id=request_id,
        input_path=input_path,
    )

    # Inspect the in-memory issue log for the article-4-direction field.
    issue_log = run_pipeline.get("save_issue_log")
    issue_log_path = run_pipeline.get("issue_log")
    issue_dict = next(
        (
            issue
            for issue in issue_log.rows
            if issue.get("field") == "article-4-direction"
        ),
        None,
    )

    assert os.path.exists(issue_log_path)
    # Guard first: without this, a missing issue would surface as a TypeError
    # on the subscripts below rather than a clear assertion failure.
    assert issue_dict is not None, "no issue logged for field 'article-4-direction'"
    assert "no associated documents found for this area" == issue_dict["issue-type"]
    assert "a4d1" == issue_dict["value"]
470+
471+
def test_pipeline_lookup_phase_assign_reference_entity(test_dirs):
    """When the referenced article-4-direction ('a4d2') IS present in
    lookup.csv, the pipeline should raise no 'no associated documents found
    for this area' issue and should assign the lookup's entity (10) as the
    row's reference-entity in the output CSV.
    """
    dataset = "article-4-direction-area"
    resource = "5158d13bfc6f0723b1fb07c975701a906e83a1ead4aee598ee34e241c79a5f3d"
    request_id = "test_request_id"

    rows = [
        {
            "reference": "ABC_0001",
            "entry-date": "2025-01-01",
            "organisation": "test-org",
            # 'a4d2' matches the lookup.csv fixture row with entity 10.
            "article-4-direction": "a4d2",
        }
    ]

    input_path = os.path.join(test_dirs["collection_dir"], resource)

    with open(input_path, "w", newline="") as f:
        fieldnames = ["reference", "entry-date", "organisation", "article-4-direction"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    # Run the pipeline
    run_pipeline = run_pipeline_for_test(
        test_dirs=test_dirs,
        dataset=dataset,
        resource=resource,
        request_id=request_id,
        input_path=input_path,
    )

    # assert given error does not exist in issue_log
    issue_log = run_pipeline.get("save_issue_log")
    assert all(
        issue.get("issue-type") != "no associated documents found for this area"
        for issue in issue_log.rows
    )

    output_path = run_pipeline.get("output_path")
    # Check existence BEFORE parsing: otherwise a missing file fails with
    # FileNotFoundError inside read_csv instead of this assertion.
    assert os.path.exists(output_path)
    output_df = pd.read_csv(output_path)

    # pandas reads the numeric column as float, hence 10.0 rather than 10.
    assert 10.0 in output_df["reference-entity"].values
0 commit comments