balancer-main/server/api/views/uploadFile/title.py at d47606726c9628eeb01a774b3c374c6e808f3efa · sahilds1/balancer-main · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re

import fitz

from ...services.openai_services import openAIServices

# Regular expression matching common research white paper titles (generated with ChatGPT).
# A candidate matches only if it:
#   - contains at least 3 words (first lookahead),
#   - contains no standalone 4-digit year starting with 19 or 20 (no dates),
#   - contains no version token such as "v2" ("v" followed by digits; case-insensitive),
#   - starts with an ASCII letter or digit and ends with an ASCII letter or ")",
#   - consists only of word characters, whitespace and the punctuation : , - ( )
#     (so any other punctuation, e.g. a period, rejects the candidate).
# NOTE: the pattern is anchored at both ends, so leading/trailing whitespace in the
# candidate prevents a match.
title_regex = re.compile(r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)

def generate_title(pdf: fitz.Document) -> str | None:
    """Pick a title for a PDF document.

    Candidates are tried in this order, and the first suitable one wins:

    1. the ``title`` entry of the document metadata,
    2. the first text block on the first page that looks like a title,
    3. a title generated by ``summarize_pdf``.

    A candidate from steps 1 or 2 is suitable when it matches ``title_regex``
    and fits the 255-character limit of the UploadFile model title.

    Args:
        pdf: An opened PyMuPDF document; its first page is read.

    Returns:
        The chosen title, or ``None`` if even the generated title is empty.
    """
    # UploadFile model title is limited to 255 chars, so a longer candidate
    # cannot be stored as-is and is skipped.
    max_title_length = 255

    def is_suitable(candidate: str) -> bool:
        return (
            len(candidate) <= max_title_length
            and title_regex.match(candidate) is not None
        )

    # PyMuPDF documents `metadata` as possibly None (e.g. encrypted files);
    # the "title" entry itself may also be None or empty.
    metadata = pdf.metadata or {}
    # Strip BEFORE matching: title_regex is anchored at both ends, so
    # surrounding whitespace would otherwise reject a perfectly good title.
    metadata_title = (metadata.get("title") or "").strip()
    if metadata_title:
        if is_suitable(metadata_title):
            print("suitable title was found in metadata")
            return metadata_title
        print("metadata title did not match regex")

    print("Looking for title in first page text")
    first_page = pdf[0]
    for block in first_page.get_text("blocks"):
        if block[6] != 0:  # only consider text blocks.
            continue
        # Extracted PDF text contains line breaks and runs of extra spaces;
        # normalise every whitespace run to a single space.
        candidate = " ".join(block[4].split())
        if is_suitable(candidate):
            return candidate

    print("no suitable title found in first page text. Using GPT-4 to summarize the PDF")
    return summarize_pdf(pdf) or None


def summarize_pdf(pdf: fitz.Document) -> str:
    """
    Ask an OpenAI chat model (gpt-4o) to propose a title for a PDF document.

    Only the text of the first page is sent to the model.

    Args:
        pdf: An opened PyMuPDF document; its first page is read.

    Returns:
        The proposed title with surrounding whitespace removed and cut to at
        most 255 characters. An empty string if the model returned no text.

    Raises:
        Exception: If the first page text could not be read.
    """
    first_page = pdf[0]
    first_page_content = first_page.get_text()

    if first_page_content is None:
        raise Exception("Failed to read the first page of the PDF file")

    # UploadFile model title is limited to 255 chars.
    max_title_length = 255
    prompt = "Please provide a title for this document. The title should be less than 256 characters and will be displayed on a webpage."
    response = openAIServices.openAI(first_page_content, prompt, model='gpt-4o', temp=0.0)

    # NOTE(review): presumably an OpenAI chat-completion response, whose
    # message content can be None - fall back to "" to honour the `str` return.
    generated_title = (response.choices[0].message.content or "").strip()

    # The prompt only *asks* for a short title; enforce the limit here so an
    # over-long reply can never exceed what the model field can hold.
    return generated_title[:max_title_length].rstrip()