Refactoring Batch Processing to allow obtaining subjects from PDF Files (#41)

adrianoaru-nhs · web-flow · commit c1d41de851a6 · 2025-05-01T12:42:59.000+01:00
…r from the PDF file instead of the DB  ## Description  Added a new feature in the batch_processing util to allow obtaining the NHS Number from the PDF file instead of the DB. This is enabled by passing True for the new optional get_subjects_from_pdf argument. Currently this is demonstrated with the S1 batch in test_compartment_1.py. ## Context  This gives us another way to obtain subjects from a batch that is closer to how an actual user would do so. ## Type of changes  - [x] Refactoring (non-breaking change) - [x] New feature (non-breaking change which adds functionality) - [ ] Breaking change (fix or feature that would change existing functionality) - [ ] Bug fix (non-breaking change which fixes an issue) ## Checklist  - [x] I am familiar with the [contributing guidelines](https://github.com/nhs-england-tools/playwright-python-blueprint/blob/main/CONTRIBUTING.md) - [x] I have followed the code style of the project - [x] I have added tests to cover my changes (where appropriate) - [x] I have updated the documentation accordingly - [ ] This PR is a result of pair or mob programming --- ## Sensitive Information Declaration To ensure the utmost confidentiality and protect your and others' privacy, we kindly ask you to NOT include [PII (Personal Identifiable Information) / PID (Personal Identifiable Data)](https://digital.nhs.uk/data-and-information/keeping-data-safe-and-benefitting-the-public) or any other sensitive data in this PR (Pull Request) and the codebase changes. We will remove any PR that contains any sensitive information. We really appreciate your cooperation in this matter. - [x] I confirm that neither PII/PID nor sensitive data are included in this PR and the codebase changes.
diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,4 @@ pandas~=2.2.3
 python-dotenv~=1.0.1
 sqlalchemy>=2.0.38
 jproperties~=2.1.2
+pypdf>=5.3.0
diff --git a/tests/smokescreen/test_compartment_1.py b/tests/smokescreen/test_compartment_1.py
@@ -88,7 +88,7 @@ def test_compartment_1(page: Page, smokescreen_properties: dict) -> None:
             "Skipping S1 Pre-invitation (FIT) (digital leaflet) as no self referral invitations were generated"
         )
     batch_processing(
-        page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True
+        page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True, True
     )
 
     # Print the batch of Invitation & Test Kit Letters - England
diff --git a/utils/batch_processing.py b/utils/batch_processing.py
@@ -10,10 +10,12 @@
 from utils.screening_subject_page_searcher import verify_subject_event_status_by_nhs_no
 from utils.oracle.oracle_specific_functions import get_nhs_no_from_batch_id
 from utils.oracle.oracle import OracleDB
+from utils.pdf_reader import extract_nhs_no_from_pdf
 import os
 import pytest
 from playwright.sync_api import Page
 import logging
+import pandas as pd
 
 
 def batch_processing(
@@ -22,6 +24,7 @@ def batch_processing(
     batch_description: str,
     latest_event_status: str,
     run_timed_events: bool = False,
+    get_subjects_from_pdf: bool = False,
 ) -> None:
     """
     This util is used to process batches. It expects the following inputs:
@@ -30,6 +33,7 @@ def batch_processing(
     - batch_description: This is the description of the batch. E.g. Pre-invitation (FIT)
     - latest_event_status: This is the status the subject will get updated to after the batch has been processed.
     - run_timed_events: This is an optional input that executes bcss_timed_events if set to True
+    - get_subjects_from_pdf: This is an optional input to change the method of retrieving subjects from the batch from the DB to the PDF file.
     """
     logging.info(f"Processing {batch_type} - {batch_description} batch")
     BasePage(page).click_main_menu_link()
@@ -39,8 +43,9 @@ def batch_processing(
 
     batch_description_cells = page.locator(f"//td[text()='{batch_description}']")
 
-    if batch_description_cells.count() == 0 and page.locator(
-        "td", has_text="No matching records found"
+    if (
+        batch_description_cells.count() == 0
+        and page.locator("td", has_text="No matching records found").is_visible()
     ):
         pytest.fail(f"No {batch_type} {batch_description} batch found")
 
@@ -55,41 +60,31 @@ def batch_processing(
             logging.info(
                 f"Successfully found open '{batch_type} - {batch_description}' batch"
             )
-            try:
-                logging.info(
-                    f"Attempting to get NHS Numbers for batch {link_text} from the DB"
-                )
-                nhs_no_df = get_nhs_no_from_batch_id(link_text)
-                logging.info(
-                    f"Successfully retrieved NHS Numbers from batch {link_text}"
-                )
-            except Exception as e:
-                pytest.fail(
-                    f"Failed to retrieve NHS Numbers from batch {link_text}, {str(e)}"
-                )
             link.click()
             break
         elif (i + 1) == batch_description_cells.count():
             pytest.fail(f"No open '{batch_type} - {batch_description}' batch found")
 
-    prepare_and_print_batch(page, link_text)
+    if get_subjects_from_pdf:
+        logging.info(f"Getting NHS Numbers for batch {link_text} from the PDF File")
+        nhs_no_df = prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
+    else:
+        logging.info(f"Getting NHS Numbers for batch {link_text} from the DB")
+        prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
+        nhs_no_df = get_nhs_no_from_batch_id(link_text)
 
     check_batch_in_archived_batch_list(page, link_text)
 
     first_nhs_no = nhs_no_df["subject_nhs_number"].iloc[0]
-    try:
-        verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
-        logging.info(
-            f"Successfully verified NHS number {first_nhs_no} with status {latest_event_status}"
-        )
-    except Exception as e:
-        pytest.fail(f"Verification failed for NHS number {first_nhs_no}: {str(e)}")
+    verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
 
     if run_timed_events:
         OracleDB().exec_bcss_timed_events(nhs_no_df)
 
 
-def prepare_and_print_batch(page: Page, link_text) -> None:
+def prepare_and_print_batch(
+    page: Page, link_text: str, get_subjects_from_pdf: bool = False
+) -> pd.DataFrame | None:
     """
     This method prepares the batch, retreives the files and confirms them as printed
     Once those buttons have been pressed it waits for the message 'Batch Successfully Archived'
@@ -114,6 +109,11 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
             file = download_file.suggested_filename
             # Wait for the download process to complete and save the downloaded file in a temp folder
             download_file.save_as(file)
+            nhs_no_df = (
+                extract_nhs_no_from_pdf(file)
+                if file.endswith(".pdf") and get_subjects_from_pdf
+                else None
+            )
             os.remove(file)  # Deletes the file after extracting the necessary data
     except Exception as e:
         pytest.fail(f"No retrieve button available to click: {str(e)}")
@@ -137,6 +137,8 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
     except Exception as e:
         pytest.fail(f"Batch successfully archived message is not shown: {str(e)}")
 
+    return nhs_no_df
+
 
 def check_batch_in_archived_batch_list(page: Page, link_text) -> None:
     """
diff --git a/utils/pdf_reader.py b/utils/pdf_reader.py
@@ -0,0 +1,33 @@
+from pypdf import PdfReader
+import pandas as pd
+
+
+def extract_nhs_no_from_pdf(file: str) -> pd.DataFrame:
+    """
+    Extract the NHS numbers printed in a batch PDF file.
+
+    Every page is scanned line by line; on each line containing the label
+    "NHS No" all digit characters are joined to form the NHS number.
+
+    Args:
+        file: Path of the PDF file to read.
+
+    Returns:
+        A DataFrame with a single "subject_nhs_number" column holding each
+        distinct NHS number once, in the order first seen, on a contiguous
+        0-based index. The DataFrame is empty if no NHS number is found.
+    """
+    # A dict keeps insertion order and drops repeats, so a number printed more
+    # than once (same page or a later page) yields a single row.
+    nhs_numbers: dict[str, None] = {}
+    for pdf_page in PdfReader(file).pages:
+        # Pages with no extractable text are treated as empty rather than failing.
+        page_text = pdf_page.extract_text() or ""
+        for line in page_text.splitlines():
+            if "NHS No" not in line:
+                continue
+            nhs_no = "".join(filter(str.isdigit, line))
+            # Skip label-only lines so an empty string is never stored as an NHS number.
+            if nhs_no:
+                nhs_numbers[nhs_no] = None
+    return pd.DataFrame({"subject_nhs_number": list(nhs_numbers)})

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ def test_compartment_1(page: Page, smokescreen_properties: dict) -> None:`
`88`	`88`	`"Skipping S1 Pre-invitation (FIT) (digital leaflet) as no self referral invitations were generated"`
`89`	`89`	`)`
`90`	`90`	`batch_processing(`
`91`		`- page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True`
	`91`	`+ page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True, True`
`92`	`92`	`)`
`93`	`93`
`94`	`94`	`# Print the batch of Invitation & Test Kit Letters - England`