|
| 1 | +import re |
| 2 | + |
| 3 | +import fitz |
| 4 | + |
| 5 | +from server.api.services.openai_services import openAIServices |
| 6 | + |
# Heuristic pattern for typical research white-paper titles (originally
# generated with ChatGPT). A candidate must contain at least three words and
# must not contain a 19xx/20xx year or a version token such as "v2".
title_regex = re.compile(r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)

# Extracted PDF text tends to contain runs of extra whitespace; compiled once
# here so it is not re-evaluated for every text block.
_extra_whitespace_regex = re.compile(r"\s{2,}")


def generate_title(pdf: fitz.Document) -> str | None:
    """Return a display title for *pdf*, or None when none can be produced.

    Candidates are tried in this order and the first acceptable one wins:

    1. The ``title`` entry of the document metadata, stripped of surrounding
       whitespace, if it matches ``title_regex``.
    2. The first text block of the first page that matches ``title_regex``
       after newlines and repeated whitespace are collapsed to single spaces.
    3. Whatever ``summarize_pdf`` returns (an empty result becomes None).

    A document without pages yields None, because both the first-page scan and
    ``summarize_pdf`` need a first page to read.
    """
    # Metadata may be missing entirely or lack a "title" entry, so avoid
    # direct indexing.
    metadata = pdf.metadata or {}
    # Strip BEFORE matching: the pattern is anchored at both ends, so stray
    # leading/trailing whitespace would otherwise reject a valid title.
    metadata_title = (metadata.get("title") or "").strip()
    if metadata_title:
        if title_regex.match(metadata_title):
            print("suitable title was found in metadata")
            return metadata_title
        print("metadata title did not match regex")

    if len(pdf) == 0:
        print("PDF has no pages. Unable to determine a title")
        return None

    print("Looking for title in first page text")
    for block in pdf[0].get_text("blocks"):
        if block[6] != 0:  # only consider text blocks
            continue
        flattened = block[4].strip().replace("\n", " ")
        candidate = _extra_whitespace_regex.sub(" ", flattened)
        if title_regex.match(candidate):
            return candidate

    print("no suitable title found in first page text. Using GPT-4 to summarize the PDF")
    gpt_title = summarize_pdf(pdf)
    return gpt_title or None
| 41 | + |
| 42 | + |
def summarize_pdf(pdf: fitz.Document) -> str:
    """Ask OpenAI's gpt-4o model for a title based on the first page of *pdf*.

    The plain text of the first page is sent together with a prompt asking
    for a title. The reply is stripped of surrounding whitespace and cut to
    255 characters, the limit of the UploadFile model's title field.

    Returns:
        The generated title, or an empty string when the model reply carries
        no text content.

    Raises:
        Exception: if the first page's text comes back as None.
    """
    # UploadFile model title is limited to 255 chars.
    max_title_length = 255

    first_page_content = pdf[0].get_text()

    # NOTE(review): get_text() appears to always return a string (possibly
    # empty for image-only pages), so this guard may never fire. Confirm
    # whether an empty page should be rejected here as well.
    if first_page_content is None:
        raise Exception("Failed to read the first page of the PDF file")

    prompt = "Please provide a title for this document. The title should be less than 256 characters and will be displayed on a webpage."
    response = openAIServices.openAI(first_page_content, prompt, model='gpt-4o', temp=0.0)

    # The message content may be absent; honour the declared `str` return
    # type instead of leaking None to callers.
    content = response.choices[0].message.content
    if content is None:
        return ""

    # The prompt only *asks* for a short title; enforce the limit here so an
    # over-long reply cannot exceed the title field's capacity.
    return content.strip()[:max_title_length]
0 commit comments