Skip to content

Commit ec5ae26

Browse files
Adding a new option in batch processing to allow getting the NHS Number from the PDF file instead of the DB
1 parent 98c6bf0 commit ec5ae26

File tree

4 files changed

+41
-23
lines changed

4 files changed

+41
-23
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ pandas~=2.2.3
77
python-dotenv~=1.0.1
88
sqlalchemy>=2.0.38
99
jproperties~=2.1.2
10+
pypdf>=5.3.0

tests/smokescreen/test_compartment_1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def test_compartment_1(page: Page, smokescreen_properties: dict) -> None:
8888
"Skipping S1 Pre-invitation (FIT) (digital leaflet) as no self referral invitations were generated"
8989
)
9090
batch_processing(
91-
page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True
91+
page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True, True
9292
)
9393

9494
# Print the batch of Invitation & Test Kit Letters - England

utils/batch_processing.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010
from utils.screening_subject_page_searcher import verify_subject_event_status_by_nhs_no
1111
from utils.oracle.oracle_specific_functions import get_nhs_no_from_batch_id
1212
from utils.oracle.oracle import OracleDB
13+
from utils.pdf_reader import extract_nhs_no_from_pdf
1314
import os
1415
import pytest
1516
from playwright.sync_api import Page
1617
import logging
18+
import pandas as pd
1719

1820

1921
def batch_processing(
@@ -22,6 +24,7 @@ def batch_processing(
2224
batch_description: str,
2325
latest_event_status: str,
2426
run_timed_events: bool = False,
27+
get_subjects_from_pdf: bool = False,
2528
) -> None:
2629
"""
2730
This util is used to process batches. It expects the following inputs:
@@ -39,8 +42,9 @@ def batch_processing(
3942

4043
batch_description_cells = page.locator(f"//td[text()='{batch_description}']")
4144

42-
if batch_description_cells.count() == 0 and page.locator(
43-
"td", has_text="No matching records found"
45+
if (
46+
batch_description_cells.count() == 0
47+
and page.locator("td", has_text="No matching records found").is_visible()
4448
):
4549
pytest.fail(f"No {batch_type} {batch_description} batch found")
4650

@@ -55,41 +59,31 @@ def batch_processing(
5559
logging.info(
5660
f"Successfully found open '{batch_type} - {batch_description}' batch"
5761
)
58-
try:
59-
logging.info(
60-
f"Attempting to get NHS Numbers for batch {link_text} from the DB"
61-
)
62+
if not get_subjects_from_pdf:
63+
logging.info(f"Getting NHS Numbers for batch {link_text} from the DB")
6264
nhs_no_df = get_nhs_no_from_batch_id(link_text)
63-
logging.info(
64-
f"Successfully retrieved NHS Numbers from batch {link_text}"
65-
)
66-
except Exception as e:
67-
pytest.fail(
68-
f"Failed to retrieve NHS Numbers from batch {link_text}, {str(e)}"
69-
)
7065
link.click()
7166
break
7267
elif (i + 1) == batch_description_cells.count():
7368
pytest.fail(f"No open '{batch_type} - {batch_description}' batch found")
7469

75-
prepare_and_print_batch(page, link_text)
70+
if get_subjects_from_pdf:
71+
nhs_no_df = prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
72+
else:
73+
prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
7674

7775
check_batch_in_archived_batch_list(page, link_text)
7876

7977
first_nhs_no = nhs_no_df["subject_nhs_number"].iloc[0]
80-
try:
81-
verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
82-
logging.info(
83-
f"Successfully verified NHS number {first_nhs_no} with status {latest_event_status}"
84-
)
85-
except Exception as e:
86-
pytest.fail(f"Verification failed for NHS number {first_nhs_no}: {str(e)}")
78+
verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
8779

8880
if run_timed_events:
8981
OracleDB().exec_bcss_timed_events(nhs_no_df)
9082

9183

92-
def prepare_and_print_batch(page: Page, link_text) -> None:
84+
def prepare_and_print_batch(
85+
page: Page, link_text: str, get_subjects_from_pdf: bool = False
86+
) -> pd.DataFrame | None:
9387
"""
9488
This method prepares the batch, retreives the files and confirms them as printed
9589
Once those buttons have been pressed it waits for the message 'Batch Successfully Archived'
@@ -114,6 +108,8 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
114108
file = download_file.suggested_filename
115109
# Wait for the download process to complete and save the downloaded file in a temp folder
116110
download_file.save_as(file)
111+
if file.endswith(".pdf") and get_subjects_from_pdf:
112+
nhs_no_df = extract_nhs_no_from_pdf(file)
117113
os.remove(file) # Deletes the file after extracting the necessary data
118114
except Exception as e:
119115
pytest.fail(f"No retrieve button available to click: {str(e)}")
@@ -137,6 +133,8 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
137133
except Exception as e:
138134
pytest.fail(f"Batch successfully archived message is not shown: {str(e)}")
139135

136+
return nhs_no_df if "nhs_no_df" in locals() else None
137+
140138

141139
def check_batch_in_archived_batch_list(page: Page, link_text) -> None:
142140
"""

utils/pdf_reader.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from pypdf import PdfReader
2+
import pandas as pd
3+
4+
5+
def extract_nhs_no_from_pdf(file: str) -> pd.DataFrame:
    """
    Build a DataFrame of NHS numbers read from the text of a PDF file.

    Every page of the PDF is scanned line by line. For each line containing
    the marker "NHS No", all digit characters on that line are concatenated
    and stored as a string in the "subject_nhs_number" column.

    Rows are labelled with the zero-based page index, so a page contributes
    at most one row: if several lines on the same page contain "NHS No",
    the last one overwrites the earlier ones.

    Args:
        file (str): Path of the PDF file to read.

    Returns:
        pd.DataFrame: A DataFrame with the single column "subject_nhs_number".
        It is empty when no line containing "NHS No" is found.
    """
    pdf_document = PdfReader(file)
    nhs_no_df = pd.DataFrame(columns=["subject_nhs_number"])

    for page_index, pdf_page in enumerate(pdf_document.pages):
        page_text = pdf_page.extract_text()
        # Skip pages that do not mention an NHS number at all
        if "NHS No" not in page_text:
            continue

        # keepends=True mirrors the original split; line endings hold no digits
        for line in page_text.splitlines(True):
            if "NHS No" not in line:
                continue
            # Keep only the digit characters from the matching line
            digits_only = "".join(filter(str.isdigit, line))
            nhs_no_df.loc[page_index] = [digits_only]

    return nhs_no_df

0 commit comments

Comments
 (0)