Skip to content

Commit 6c8961f

Browse files
authored
Merge pull request #301 from ricanontherun/generate-upload-ttle
Generate upload title
2 parents e590a4b + 016d6cc commit 6c8961f

File tree

3 files changed

+133
-4
lines changed

3 files changed

+133
-4
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import unittest
2+
from unittest.mock import MagicMock, patch
3+
4+
import title
5+
6+
class TestGenerateTitle(unittest.TestCase):
    """Tests for ``title.generate_title`` run against mocked PDF documents."""

    # A realistic paper title that the title regex accepts.
    PAPER_TITLE = (
        "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics "
        "and Mood Stabilizers for Bipolar Disorder and Schizophrenia"
    )

    @staticmethod
    def _text_block(text):
        """Build a 7-item block holding only the fields generate_title reads."""
        block = [None] * 7
        block[4] = text  # the block's text
        block[6] = 0  # block type; generate_title keeps only blocks where this is 0
        return block

    def _doc_with_first_page_blocks(self, metadata_title, block_texts):
        """Return a mocked document with the given metadata title and page-one blocks."""
        doc = MagicMock()
        doc.metadata = {"title": metadata_title}
        doc[0].get_text.return_value = [self._text_block(text) for text in block_texts]
        return doc

    def test_prefers_metadata_title_if_valid(self):
        doc = MagicMock()
        doc.metadata = {"title": "A Study Regarding The Efficacy of Drugs"}

        self.assertEqual("A Study Regarding The Efficacy of Drugs", title.generate_title(doc))

    def test_falls_back_to_first_page_text_if_metadata_title_is_empty(self):
        doc = self._doc_with_first_page_blocks("", ["foo", self.PAPER_TITLE, "bar"])

        self.assertEqual(self.PAPER_TITLE, title.generate_title(doc))

    def test_falls_back_to_first_page_text_if_metadata_title_does_not_match_regex(self):
        doc = self._doc_with_first_page_blocks("abcd1234", ["foo", self.PAPER_TITLE, "bar"])

        self.assertEqual(self.PAPER_TITLE, title.generate_title(doc))

    @patch("server.api.services.openai_services.openAIServices.openAI")
    def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
        doc = MagicMock()
        doc.metadata = {"title": None}

        # The first page is read twice: once as blocks (title search) and once as
        # plain text (summary). Configure the page itself -- not the document --
        # so the "no blocks" condition is explicit rather than a MagicMock accident.
        def first_page_get_text(*args, **kwargs):
            return [] if (args or kwargs) else "Some first page text"

        doc[0].get_text.side_effect = first_page_get_text

        mock_response = MagicMock()
        mock_response.choices = [MagicMock()]
        mock_response.choices[0].message.content = "A Study Regarding The Efficacy of Drugs"
        mock_openAI.return_value = mock_response

        # Check the returned title as well as the fact that OpenAI was consulted.
        self.assertEqual("A Study Regarding The Efficacy of Drugs", title.generate_title(doc))
        mock_openAI.assert_called_once()
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import re
2+
3+
import fitz
4+
5+
from server.api.services.openai_services import openAIServices
6+
7+
# Regular expression that matches common research / white-paper titles (generated by ChatGPT).
8+
# A match requires at least 3 words and rejects text containing a standalone 19xx/20xx
# year or a version token such as "v2". The text must start with a letter or digit and
# end with a letter or ")"; in between only word characters, whitespace and : , - ( )
# are allowed.
9+
title_regex = re.compile(r'^(?=(?:\b\w+\b[\s:,\-\(\)]*){3,})(?!.*\b(?:19|20)\d{2}\b)(?!.*\bv\d+\b)[A-Za-z0-9][\w\s:,\-\(\)]*[A-Za-z\)]$', re.IGNORECASE)
10+
11+
def generate_title(pdf: fitz.Document) -> str | None:
    """
    Choose a display title for an uploaded PDF.

    Sources are tried in order and the first usable one wins:

    1. the ``title`` entry of the PDF metadata,
    2. the first text block on page one that matches ``title_regex``,
    3. a title generated by ``summarize_pdf``.

    Metadata and first-page candidates longer than 255 characters are skipped,
    and a generated title is truncated to that length, because the UploadFile
    model's title field is limited to 255 chars.

    Returns the title, or None when no source produced one.
    """
    max_title_length = 255

    # Tolerate a missing/None metadata dict or a missing "title" key, and strip
    # BEFORE matching: the regex is anchored, so surrounding whitespace would
    # otherwise reject a perfectly good title.
    metadata = pdf.metadata or {}
    metadata_title = (metadata.get("title") or "").strip()
    if metadata_title:
        if len(metadata_title) <= max_title_length and title_regex.match(metadata_title):
            print("suitable title was found in metadata")
            return metadata_title
        print("metadata title did not match regex")

    print("Looking for title in first page text")
    for block in pdf[0].get_text("blocks"):
        if block[6] != 0:  # only consider text blocks.
            continue
        # Extracted PDF text carries line breaks and runs of spaces; normalise
        # every whitespace run to a single space and trim the ends.
        candidate = " ".join(block[4].split())
        if len(candidate) <= max_title_length and title_regex.match(candidate):
            return candidate

    print("no suitable title found in first page text. Using GPT-4 to summarize the PDF")
    gpt_title = (summarize_pdf(pdf) or "").strip()
    return gpt_title[:max_title_length] or None
41+
42+
43+
def summarize_pdf(pdf: fitz.Document) -> str:
    """
    Ask OpenAI's gpt-4o model for a title based on the first page's text.

    Returns the model's answer with surrounding whitespace removed. An empty
    string is returned when the first page has no extractable text (nothing to
    summarize, so no API call is made) or when the response carries no content.

    Raises Exception if the first page text cannot be read at all.
    """
    first_page_content = pdf[0].get_text()

    if first_page_content is None:
        raise Exception("Failed to read the first page of the PDF file")

    # A blank page gives the model nothing to work with; skip the request
    # rather than pay for a made-up title.
    if not first_page_content.strip():
        return ""

    # UploadFile model title is limited to 255 chars.
    prompt = "Please provide a title for this document. The title should be less than 256 characters and will be displayed on a webpage."
    response = openAIServices.openAI(first_page_content, prompt, model='gpt-4o', temp=0.0)

    # The message content may be absent; keep the declared ``str`` return type.
    content = response.choices[0].message.content
    return (content or "").strip()

server/api/views/uploadFile/views.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from ...models.model_embeddings import Embeddings
1616
import fitz
1717
from django.db import transaction
18+
from .title import generate_title
1819

1920

2021
@method_decorator(csrf_exempt, name='dispatch')
@@ -77,22 +78,27 @@ def post(self, request, format=None):
7778
uploaded_by=request.user, # Set to the user instance
7879
uploaded_by_email=request.user.email # Also store the email separately
7980
)
80-
new_file.save()
81-
82-
if new_file.id is None:
83-
return Response({"message": "Failed to save the upload file."}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
8481

8582
with fitz.open(stream=pdf_binary, filetype="pdf") as doc:
8683
text = ""
8784
page_number = 1 # Initialize page_number
8885
page_texts = [] # List to hold text for each page with page number
8986

87+
title = generate_title(doc)
88+
if title is not None:
89+
new_file.title = title
90+
9091
for page in doc:
9192
page_text = page.get_text()
9293
text += page_text
9394
page_texts.append((page_number, page_text))
95+
9496
page_number += 1
9597

98+
new_file.save()
99+
if new_file.id is None:
100+
return Response({"message": "Failed to save the upload file."}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
101+
96102
chunks_with_page = []
97103

98104
# Create chunks along with their corresponding page number

0 commit comments

Comments
 (0)