11import os
2- import shutil
3- import tempfile
42import uuid
3+ from concurrent .futures import ThreadPoolExecutor
4+ from io import BytesIO
55from urllib import parse
66
77from botocore .exceptions import ClientError
88from enums .lambda_error import LambdaError
99from enums .trace_status import TraceStatus
1010from models .document_reference import DocumentReference
1111from models .stitch_trace import StitchTrace
12+ from pikepdf import Pdf
1213from pypdf .errors import PyPdfError
1314from services .base .s3_service import S3Service
1415from services .document_service import DocumentService
15- from services .pdf_stitch_service import stitch_pdf
1616from utils .audit_logging_setup import LoggingService
1717from utils .exceptions import NoAvailableDocument
1818from utils .filename_utils import extract_page_number
1919from utils .lambda_exceptions import LGStitchServiceException
2020from utils .lloyd_george_validator import check_for_number_of_files_match_expected
21- from utils .utilities import create_reference_id , get_file_key_from_s3_url
21+ from utils .utilities import get_file_key_from_s3_url
2222
2323logger = LoggingService (__name__ )
2424
@@ -33,70 +33,107 @@ def __init__(self, stitch_trace: StitchTrace):
3333
3434 self .s3_service = S3Service ()
3535 self .document_service = DocumentService ()
36- self .temp_folder = tempfile .mkdtemp ()
3736 self .stitch_trace_object = stitch_trace
3837 self .stitch_trace_table = os .environ .get ("STITCH_METADATA_DYNAMODB_NAME" )
3938 self .stitch_file_name = f"patient-record-{ str (uuid .uuid4 ())} "
40- self .stitch_file_path = os . path . join ( self . temp_folder , self . stitch_file_name )
39+ self .combined_file_folder = "combined_files"
4140
def handle_stitch_request(self):
    """Run a stitch job end to end.

    Stitches the patient's Lloyd George record first and, only if that
    returns without raising, calls ``update_stitch_job_complete``.
    """
    self.stitch_lloyd_george_record()
    self.update_stitch_job_complete()
4544
def stitch_lloyd_george_record(self):
    """Produce a single viewable Lloyd George PDF for the patient.

    A record made of exactly one file is not re-uploaded: the stitch trace
    is pointed at the existing S3 key and its size is read from S3. A
    record made of several files is merged in memory, uploaded under
    ``combined_file_folder`` and the trace is pointed at the new key.

    Raises:
        LGStitchServiceException: 404 (``StitchNotFound``) when no
            documents are found for the patient; 500 (``StitchClient``)
            when an S3 client error, a PDF library error, a missing file
            or ``NoAvailableDocument`` is raised while processing.
    """
    # Merging is done with pikepdf, whose failures derive from
    # pikepdf.PdfError rather than pypdf's PyPdfError. Imported locally
    # because the module itself only imports ``Pdf`` from pikepdf.
    from pikepdf import PdfError

    try:
        documents_for_stitching = self.get_lloyd_george_record_for_patient()
        if not documents_for_stitching:
            raise LGStitchServiceException(404, LambdaError.StitchNotFound)

        if len(documents_for_stitching) == 1:
            # Single file: nothing to merge, serve the stored object as-is.
            document_to_stitch = documents_for_stitching[0]
            file_location = document_to_stitch.file_location
            file_s3_key = get_file_key_from_s3_url(file_location)

            self.prepare_documents_for_stitching(documents_for_stitching)
            self.stitch_trace_object.total_file_size_in_bytes = (
                self.get_total_file_size_in_bytes(document=document_to_stitch)
            )
            self.stitch_trace_object.stitched_file_location = file_s3_key

        else:
            filename_for_stitched_file = f"{self.stitch_file_name}.pdf"
            destination_key = (
                f"{self.combined_file_folder}/{filename_for_stitched_file}"
            )
            ordered_documents = self.prepare_documents_for_stitching(
                documents_for_stitching
            )
            stitched_lg_stream = self.stream_and_stitch_documents(ordered_documents)
            # Size of the merged PDF, measured before the upload consumes
            # the stream.
            self.stitch_trace_object.total_file_size_in_bytes = (
                stitched_lg_stream.getbuffer().nbytes
            )

            self.upload_stitched_lg_record(
                stitched_lg_stream=stitched_lg_stream,
                filename_on_bucket=destination_key,
            )

            self.stitch_trace_object.stitched_file_location = destination_key

        # Audited for both the single-file and the merged path.
        logger.audit_splunk_info(
            "User has viewed Lloyd George records",
            {"Result": "Successful viewing LG"},
        )

    except (
        ClientError,
        PyPdfError,
        PdfError,
        FileNotFoundError,
        NoAvailableDocument,
    ) as e:
        logger.error(
            f"{LambdaError.StitchClient.to_str()}: {str(e)}",
            {"Result": "Lloyd George stitching failed"},
        )
        raise LGStitchServiceException(500, LambdaError.StitchClient) from e
7293
73- def get_documents_for_stitching (self ):
74- try :
75- documents_for_stitching = self .get_lloyd_george_record_for_patient ()
76- if not documents_for_stitching :
77- raise LGStitchServiceException (404 , LambdaError .StitchNotFound )
def fetch_pdf(self, doc: DocumentReference) -> Pdf:
    """Load one document from the Lloyd George bucket and open it as a PDF.

    Args:
        doc: Document reference whose ``file_location`` is an S3 URL.

    Returns:
        The opened ``Pdf``; the caller is responsible for closing it.
    """
    object_key = get_file_key_from_s3_url(doc.file_location)
    # NOTE(review): assumes stream_s3_object_to_memory returns a seekable
    # binary stream (e.g. BytesIO) — confirm against S3Service.
    in_memory_file = self.s3_service.stream_s3_object_to_memory(
        bucket=self.lloyd_george_bucket_name,
        key=object_key,
    )
    # Rewind so the PDF parser reads from the first byte.
    in_memory_file.seek(0)
    return Pdf.open(in_memory_file)
102+
def stream_and_stitch_documents(
    self, documents: list[DocumentReference]
) -> BytesIO:
    """Fetch the given documents concurrently and merge them into one PDF.

    Documents are fetched with up to five worker threads, but their pages
    are appended strictly in the order of ``documents``.

    Args:
        documents: Document references, already in the desired page order.

    Returns:
        A ``BytesIO`` holding the merged PDF, positioned at offset 0.

    Raises:
        Any exception raised by ``fetch_pdf`` or by pikepdf while merging
        or saving is propagated to the caller.
    """
    # Context managers guarantee the output and every merged source PDF are
    # closed even when extend()/save() or a later fetch raises.
    with Pdf.new() as output_pdf:
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(self.fetch_pdf, doc) for doc in documents]

            # Iterating the futures in submission order (not completion
            # order) keeps the pages in document order.
            for future in futures:
                with future.result() as pdf:
                    output_pdf.pages.extend(pdf.pages)

        output_stream = BytesIO()
        output_pdf.save(output_stream)

    # Rewind so the consumer reads the PDF from the start.
    output_stream.seek(0)
    return output_stream
121+
def prepare_documents_for_stitching(
    self, documents: list[DocumentReference]
) -> list[DocumentReference]:
    """Mark the job as processing and return the documents in stitch order.

    Also records the number of files and the most recent ``created`` value
    on the stitch trace.

    Args:
        documents: The patient's document references; expected non-empty.

    Returns:
        ``documents`` unchanged when it holds a single item, otherwise the
        result of ``sort_documents_by_filenames``.
    """
    self.update_trace_status(TraceStatus.PROCESSING)

    # A lone document needs no ordering, so the filename sort is skipped.
    has_single_file = len(documents) == 1
    ordered = (
        documents if has_single_file else self.sort_documents_by_filenames(documents)
    )

    self.stitch_trace_object.number_of_files = len(ordered)
    latest_created = self.get_most_recent_created_date(ordered)
    self.stitch_trace_object.file_last_updated = latest_created

    return ordered
100137
101138 @staticmethod
102139 def sort_documents_by_filenames (
@@ -111,53 +148,39 @@ def sort_documents_by_filenames(
111148 )
112149 raise LGStitchServiceException (500 , LambdaError .StitchValidation )
113150
114- def download_lloyd_george_files (
115- self ,
116- ordered_lg_records : list [DocumentReference ],
117- ) -> list [str ]:
118- all_lg_parts = []
119-
120- for lg_part in ordered_lg_records :
121- file_location_on_s3 = lg_part .file_location
122- s3_file_path = get_file_key_from_s3_url (file_location_on_s3 )
123- local_file_name = os .path .join (self .temp_folder , create_reference_id ())
124- self .s3_service .download_file (
125- self .lloyd_george_bucket_name , s3_file_path , local_file_name
126- )
127- all_lg_parts .append (local_file_name )
128-
129- return all_lg_parts
130-
def upload_stitched_lg_record(
    self, stitched_lg_stream: BytesIO, filename_on_bucket: str
):
    """Upload the in-memory stitched PDF to the Lloyd George bucket.

    The object is tagged with the lifecycle-policy tag and stored with an
    inline content disposition and a PDF content type.

    Args:
        stitched_lg_stream: Stream holding the stitched PDF.
        filename_on_bucket: S3 key to store the object under.

    Raises:
        LGStitchServiceException: 500 (``StitchCloudFront``) when a
            ``ValueError`` is raised while preparing or performing the
            upload.
    """
    try:
        lifecycle_tagging = parse.urlencode({self.lifecycle_policy_tag: "true"})
        upload_options = {
            "Tagging": lifecycle_tagging,
            "ContentDisposition": "inline",
            "ContentType": "application/pdf",
        }
        self.s3_service.upload_file_obj(
            file_obj=stitched_lg_stream,
            s3_bucket_name=self.lloyd_george_bucket_name,
            file_key=filename_on_bucket,
            extra_args=upload_options,
        )
        logger.info(
            f"Uploaded stitched file to {self.lloyd_george_bucket_name} with key {filename_on_bucket}"
        )
    except ValueError as e:
        logger.error(
            f"{LambdaError.StitchCloudFront.to_str()}: {str(e)}",
            {"Result": "Failed to format CloudFront URL."},
        )
        raise LGStitchServiceException(500, LambdaError.StitchCloudFront)
153175
154176 @staticmethod
155177 def get_most_recent_created_date (documents : list [DocumentReference ]) -> str :
156178 return max (doc .created for doc in documents )
157179
158- @staticmethod
159- def get_total_file_size_in_bytes (filepaths : list [str ]) -> int :
160- return sum (os .path .getsize (filepath ) for filepath in filepaths )
def get_total_file_size_in_bytes(self, document: DocumentReference) -> int:
    """Return the size of ``document``'s S3 object as reported by the S3 service.

    The bucket and key are taken from the document reference itself
    (``s3_bucket_name`` / ``s3_file_key``).
    """
    return self.s3_service.get_file_size(
        document.s3_bucket_name, document.s3_file_key
    )
161184
162185 def update_stitch_job_complete (self ):
163186 logger .info ("Writing stitch trace to db" )
0 commit comments