Skip to content

Commit c1d41de

Browse files
Refactoring Batch Processing to allow obtaining subjects from PDF Files (#41)
…r from the PDF file instead of the DB <!-- markdownlint-disable-next-line first-line-heading --> ## Description <!-- Describe your changes in detail. --> Added a new feature in the batch_processing util to allow obtaining the NHS Number from the PDF file instead of the DB. This is done by adding True as an input variable if this feature is wanted. Currently this is demonstrated with the S1 batch in test_compartment_1.py. ## Context <!-- Why is this change required? What problem does it solve? --> This gives us another way to obtain subjects from a batch that is closer to how an actual user would do so. ## Type of changes <!-- What types of changes does your code introduce? Put an `x` in all the boxes that apply. --> - [x] Refactoring (non-breaking change) - [x] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would change existing functionality) - [ ] Bug fix (non-breaking change which fixes an issue) ## Checklist <!-- Go over all the following points, and put an `x` in all the boxes that apply. --> - [x] I am familiar with the [contributing guidelines](https://github.com/nhs-england-tools/playwright-python-blueprint/blob/main/CONTRIBUTING.md) - [x] I have followed the code style of the project - [x] I have added tests to cover my changes (where appropriate) - [x] I have updated the documentation accordingly - [ ] This PR is a result of pair or mob programming --- ## Sensitive Information Declaration To ensure the utmost confidentiality and protect your and others' privacy, we kindly ask you to NOT include [PII (Personal Identifiable Information) / PID (Personal Identifiable Data)](https://digital.nhs.uk/data-and-information/keeping-data-safe-and-benefitting-the-public) or any other sensitive data in this PR (Pull Request) and the codebase changes. We will remove any PR that does contain any sensitive information. We really appreciate your cooperation in this matter. 
- [x] I confirm that neither PII/PID nor sensitive data are included in this PR and the codebase changes.
1 parent 8f6c7e3 commit c1d41de

File tree

4 files changed

+46
-24
lines changed

4 files changed

+46
-24
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ pandas~=2.2.3
77
python-dotenv~=1.0.1
88
sqlalchemy>=2.0.38
99
jproperties~=2.1.2
10+
pypdf>=5.3.0

tests/smokescreen/test_compartment_1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def test_compartment_1(page: Page, smokescreen_properties: dict) -> None:
8888
"Skipping S1 Pre-invitation (FIT) (digital leaflet) as no self referral invitations were generated"
8989
)
9090
batch_processing(
91-
page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True
91+
page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True, True
9292
)
9393

9494
# Print the batch of Invitation & Test Kit Letters - England

utils/batch_processing.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010
from utils.screening_subject_page_searcher import verify_subject_event_status_by_nhs_no
1111
from utils.oracle.oracle_specific_functions import get_nhs_no_from_batch_id
1212
from utils.oracle.oracle import OracleDB
13+
from utils.pdf_reader import extract_nhs_no_from_pdf
1314
import os
1415
import pytest
1516
from playwright.sync_api import Page
1617
import logging
18+
import pandas as pd
1719

1820

1921
def batch_processing(
@@ -22,6 +24,7 @@ def batch_processing(
2224
batch_description: str,
2325
latest_event_status: str,
2426
run_timed_events: bool = False,
27+
get_subjects_from_pdf: bool = False,
2528
) -> None:
2629
"""
2730
This util is used to process batches. It expects the following inputs:
@@ -30,6 +33,7 @@ def batch_processing(
3033
- batch_description: This is the description of the batch. E.g. Pre-invitation (FIT)
3134
- latest_event_status: This is the status the subject will get updated to after the batch has been processed.
3235
- run_timed_events: This is an optional input that executes bcss_timed_events if set to True
36+
- get_subjects_from_pdf: This is an optional input to change the method of retrieving subjects from the batch from the DB to the PDF file.
3337
"""
3438
logging.info(f"Processing {batch_type} - {batch_description} batch")
3539
BasePage(page).click_main_menu_link()
@@ -39,8 +43,9 @@ def batch_processing(
3943

4044
batch_description_cells = page.locator(f"//td[text()='{batch_description}']")
4145

42-
if batch_description_cells.count() == 0 and page.locator(
43-
"td", has_text="No matching records found"
46+
if (
47+
batch_description_cells.count() == 0
48+
and page.locator("td", has_text="No matching records found").is_visible()
4449
):
4550
pytest.fail(f"No {batch_type} {batch_description} batch found")
4651

@@ -55,41 +60,31 @@ def batch_processing(
5560
logging.info(
5661
f"Successfully found open '{batch_type} - {batch_description}' batch"
5762
)
58-
try:
59-
logging.info(
60-
f"Attempting to get NHS Numbers for batch {link_text} from the DB"
61-
)
62-
nhs_no_df = get_nhs_no_from_batch_id(link_text)
63-
logging.info(
64-
f"Successfully retrieved NHS Numbers from batch {link_text}"
65-
)
66-
except Exception as e:
67-
pytest.fail(
68-
f"Failed to retrieve NHS Numbers from batch {link_text}, {str(e)}"
69-
)
7063
link.click()
7164
break
7265
elif (i + 1) == batch_description_cells.count():
7366
pytest.fail(f"No open '{batch_type} - {batch_description}' batch found")
7467

75-
prepare_and_print_batch(page, link_text)
68+
if get_subjects_from_pdf:
69+
logging.info(f"Getting NHS Numbers for batch {link_text} from the PDF File")
70+
nhs_no_df = prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
71+
else:
72+
logging.info(f"Getting NHS Numbers for batch {link_text} from the DB")
73+
prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
74+
nhs_no_df = get_nhs_no_from_batch_id(link_text)
7675

7776
check_batch_in_archived_batch_list(page, link_text)
7877

7978
first_nhs_no = nhs_no_df["subject_nhs_number"].iloc[0]
80-
try:
81-
verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
82-
logging.info(
83-
f"Successfully verified NHS number {first_nhs_no} with status {latest_event_status}"
84-
)
85-
except Exception as e:
86-
pytest.fail(f"Verification failed for NHS number {first_nhs_no}: {str(e)}")
79+
verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
8780

8881
if run_timed_events:
8982
OracleDB().exec_bcss_timed_events(nhs_no_df)
9083

9184

92-
def prepare_and_print_batch(page: Page, link_text) -> None:
85+
def prepare_and_print_batch(
86+
page: Page, link_text: str, get_subjects_from_pdf: bool = False
87+
) -> pd.DataFrame | None:
9388
"""
9489
This method prepares the batch, retrieves the files and confirms them as printed
9590
Once those buttons have been pressed it waits for the message 'Batch Successfully Archived'
@@ -114,6 +109,11 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
114109
file = download_file.suggested_filename
115110
# Wait for the download process to complete and save the downloaded file in a temp folder
116111
download_file.save_as(file)
112+
nhs_no_df = (
113+
extract_nhs_no_from_pdf(file)
114+
if file.endswith(".pdf") and get_subjects_from_pdf
115+
else None
116+
)
117117
os.remove(file) # Deletes the file after extracting the necessary data
118118
except Exception as e:
119119
pytest.fail(f"No retrieve button available to click: {str(e)}")
@@ -137,6 +137,8 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
137137
except Exception as e:
138138
pytest.fail(f"Batch successfully archived message is not shown: {str(e)}")
139139

140+
return nhs_no_df
141+
140142

141143
def check_batch_in_archived_batch_list(page: Page, link_text) -> None:
142144
"""

utils/pdf_reader.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from pypdf import PdfReader
2+
import pandas as pd
3+
4+
5+
def extract_nhs_no_from_pdf(file: str) -> pd.DataFrame:
    """
    Extracts the NHS numbers from a PDF file.

    The text of every page is split into lines, and each line containing the
    label "NHS No" is reduced to its digits, which are stored as one NHS number.

    Args:
        file (str): Path to the PDF file to read.

    Returns:
        pd.DataFrame: A dataframe with a single column, "subject_nhs_number",
        holding one row per NHS number found, in the order they appear in the
        file. The dataframe is empty if no NHS numbers are found.
    """
    reader = PdfReader(file)
    # Collect matches in a list and build the dataframe once at the end.
    # Writing rows keyed by page index would let a second "NHS No" line on the
    # same page overwrite the first, and would leave gaps in the index for
    # pages that contain no NHS number.
    nhs_numbers = []
    for pdf_page in reader.pages:
        page_text = pdf_page.extract_text()
        for line in page_text.splitlines():
            if "NHS No" not in line:
                continue
            # Keep only the digits so spacing/punctuation in the printed
            # number does not matter
            nhs_no = "".join(char for char in line if char.isdigit())
            # A label line without any digits carries no NHS number; skip it
            # rather than storing an empty string
            if nhs_no:
                nhs_numbers.append(nhs_no)
    return pd.DataFrame({"subject_nhs_number": nhs_numbers}, dtype=object)

0 commit comments

Comments
 (0)