Adding a new option in batch processing to allow getting the NHS Number from the PDF file instead of the DB

adrianoaru-nhs · adrianoaru-nhs · commit ec5ae262be83 · 2025-04-30T16:44:40.000+01:00
diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,4 @@ pandas~=2.2.3
 python-dotenv~=1.0.1
 sqlalchemy>=2.0.38
 jproperties~=2.1.2
+pypdf>=5.3.0
diff --git a/tests/smokescreen/test_compartment_1.py b/tests/smokescreen/test_compartment_1.py
@@ -88,7 +88,7 @@ def test_compartment_1(page: Page, smokescreen_properties: dict) -> None:
             "Skipping S1 Pre-invitation (FIT) (digital leaflet) as no self referral invitations were generated"
         )
     batch_processing(
-        page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True
+        page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True, True
     )
 
     # Print the batch of Invitation & Test Kit Letters - England
diff --git a/utils/batch_processing.py b/utils/batch_processing.py
@@ -10,10 +10,12 @@
 from utils.screening_subject_page_searcher import verify_subject_event_status_by_nhs_no
 from utils.oracle.oracle_specific_functions import get_nhs_no_from_batch_id
 from utils.oracle.oracle import OracleDB
+from utils.pdf_reader import extract_nhs_no_from_pdf
 import os
 import pytest
 from playwright.sync_api import Page
 import logging
+import pandas as pd
 
 
 def batch_processing(
@@ -22,6 +24,7 @@ def batch_processing(
     batch_description: str,
     latest_event_status: str,
     run_timed_events: bool = False,
+    get_subjects_from_pdf: bool = False,
 ) -> None:
     """
     This util is used to process batches. It expects the following inputs:
@@ -39,8 +42,9 @@ def batch_processing(
 
     batch_description_cells = page.locator(f"//td[text()='{batch_description}']")
 
-    if batch_description_cells.count() == 0 and page.locator(
-        "td", has_text="No matching records found"
+    if (
+        batch_description_cells.count() == 0
+        and page.locator("td", has_text="No matching records found").is_visible()
     ):
         pytest.fail(f"No {batch_type} {batch_description} batch found")
 
@@ -55,41 +59,31 @@ def batch_processing(
             logging.info(
                 f"Successfully found open '{batch_type} - {batch_description}' batch"
             )
-            try:
-                logging.info(
-                    f"Attempting to get NHS Numbers for batch {link_text} from the DB"
-                )
+            if not get_subjects_from_pdf:
+                logging.info(f"Getting NHS Numbers for batch {link_text} from the DB")
                 nhs_no_df = get_nhs_no_from_batch_id(link_text)
-                logging.info(
-                    f"Successfully retrieved NHS Numbers from batch {link_text}"
-                )
-            except Exception as e:
-                pytest.fail(
-                    f"Failed to retrieve NHS Numbers from batch {link_text}, {str(e)}"
-                )
             link.click()
             break
         elif (i + 1) == batch_description_cells.count():
             pytest.fail(f"No open '{batch_type} - {batch_description}' batch found")
 
-    prepare_and_print_batch(page, link_text)
+    if get_subjects_from_pdf:
+        nhs_no_df = prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
+    else:
+        prepare_and_print_batch(page, link_text, get_subjects_from_pdf)
 
     check_batch_in_archived_batch_list(page, link_text)
 
     first_nhs_no = nhs_no_df["subject_nhs_number"].iloc[0]
-    try:
-        verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
-        logging.info(
-            f"Successfully verified NHS number {first_nhs_no} with status {latest_event_status}"
-        )
-    except Exception as e:
-        pytest.fail(f"Verification failed for NHS number {first_nhs_no}: {str(e)}")
+    verify_subject_event_status_by_nhs_no(page, first_nhs_no, latest_event_status)
 
     if run_timed_events:
         OracleDB().exec_bcss_timed_events(nhs_no_df)
 
 
-def prepare_and_print_batch(page: Page, link_text) -> None:
+def prepare_and_print_batch(
+    page: Page, link_text: str, get_subjects_from_pdf: bool = False
+) -> pd.DataFrame | None:
     """
     This method prepares the batch, retreives the files and confirms them as printed
     Once those buttons have been pressed it waits for the message 'Batch Successfully Archived'
@@ -114,6 +108,8 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
             file = download_file.suggested_filename
             # Wait for the download process to complete and save the downloaded file in a temp folder
             download_file.save_as(file)
+            if file.endswith(".pdf") and get_subjects_from_pdf:
+                nhs_no_df = extract_nhs_no_from_pdf(file)
             os.remove(file)  # Deletes the file after extracting the necessary data
     except Exception as e:
         pytest.fail(f"No retrieve button available to click: {str(e)}")
@@ -137,6 +133,8 @@ def prepare_and_print_batch(page: Page, link_text) -> None:
     except Exception as e:
         pytest.fail(f"Batch successfully archived message is not shown: {str(e)}")
 
+    return nhs_no_df if "nhs_no_df" in locals() else None
+
 
 def check_batch_in_archived_batch_list(page: Page, link_text) -> None:
     """
diff --git a/utils/pdf_reader.py b/utils/pdf_reader.py
@@ -0,0 +1,19 @@
+from pypdf import PdfReader
+import pandas as pd
+
+
+def extract_nhs_no_from_pdf(file: str) -> pd.DataFrame:
+    reader = PdfReader(file)
+    nhs_no_df = pd.DataFrame(columns=["subject_nhs_number"])
+    # For loop looping through all pages of the file to find the NHS Number
+    for i, pages in enumerate(reader.pages):
+        text = pages.extract_text()
+        if "NHS No" in text:
+            # If NHS number is found split the text by every new line into a list
+            text = text.splitlines(True)
+            for split_text in text:
+                if "NHS No" in split_text:
+                    # If a string is found containing "NHS No" only digits are stored into nhs_no
+                    nhs_no = "".join([ele for ele in split_text if ele.isdigit()])
+                    nhs_no_df.loc[i] = [nhs_no]
+    return nhs_no_df

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ def test_compartment_1(page: Page, smokescreen_properties: dict) -> None:`
`88`	`88`	`"Skipping S1 Pre-invitation (FIT) (digital leaflet) as no self referral invitations were generated"`
`89`	`89`	`)`
`90`	`90`	`batch_processing(`
`91`		`- page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True`
	`91`	`+ page, "S1", "Pre-invitation (FIT)", "S9 - Pre-invitation Sent", True, True`
`92`	`92`	`)`
`93`	`93`
`94`	`94`	`# Print the batch of Invitation & Test Kit Letters - England`