
Commit bdacc92
FEATURE: PDF support for rag pipeline
(this starts by defining the extraction routines)
1 parent a996aa4 commit bdacc92

File tree

1 file changed: +175 -0 lines changed

lib/utils/pdf_to_text.rb

Lines changed: 175 additions & 0 deletions

@@ -0,0 +1,175 @@
# frozen_string_literal: true

class DiscourseAi::Utils::PdfToText
  MAX_PDF_SIZE = 100.megabytes
  MAX_CONVERT_SECONDS = 30
  BACKOFF_SECONDS = [5, 30, 60]

  attr_reader :upload, :llm_model, :user

  def initialize(upload:, llm_model:, user:)
    @upload = upload
    @llm_model = llm_model
    @user = user
    # Reuse any page images already extracted for this upload in a prior run
    @uploaded_pages = UploadReference.where(target: upload).map(&:upload)
  end

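  # Phase 1: rasterize the PDF into one PNG upload per page and link each
  # page image back to the source upload via UploadReference.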
  def extract_pages
    temp_dir = File.join(Dir.tmpdir, "discourse-pdf-#{SecureRandom.hex(8)}")
    FileUtils.mkdir_p(temp_dir)

    begin
      pdf_path =
        if upload.local?
          Discourse.store.path_for(upload)
        else
          Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
        end

      raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

      temp_pdf = File.join(temp_dir, "source.pdf")
      FileUtils.cp(pdf_path, temp_pdf)

      # Convert the PDF to individual page images
      output_pattern = File.join(temp_dir, "page-%04d.png")

      command = [
        "magick",
        "-density",
        "300",
        temp_pdf,
        "-background",
        "white",
        "-auto-orient",
        "-quality",
        "85",
        output_pattern,
      ]

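      # Assembled invocation, roughly:
      #   magick -density 300 source.pdf -background white -auto-orient -quality 85 page-%04d.png
      # (rasterize every page at 300 DPI onto a white background)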
      Discourse::Utils.execute_command(
        *command,
        failure_message: "Failed to convert PDF to images",
        timeout: MAX_CONVERT_SECONDS,
      )

      uploads = []
      Dir
        .glob(File.join(temp_dir, "page-*.png"))
        .sort
        .each do |page_path|
          page_upload =
            UploadCreator.new(File.open(page_path), "page-#{File.basename(page_path)}").create_for(
              @user.id,
            )

          uploads << page_upload
        end

      # Create upload references so the page images persist with the source PDF
      UploadReference.ensure_exist!(upload_ids: uploads.map(&:id), target: @upload)

      @uploaded_pages = uploads
    ensure
      FileUtils.rm_rf(temp_dir) if Dir.exist?(temp_dir)
    end
  end

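  # Phase 2: OCR each page image via the LLM; yields each extracted chunk with
  # its page upload, or (nil, upload, error) when a page fails all retries.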
  def extract_text(uploads: nil, retries: 3)
    uploads ||= @uploaded_pages

    raise "must specify a block" if !block_given?
    uploads
      .map do |upload|
        extracted = nil
        error = nil

        backoff = BACKOFF_SECONDS.dup

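        # Retry with increasing delays; once BACKOFF_SECONDS is exhausted,
        # keep sleeping for the last interval.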
        seconds = nil
        retries.times do
          begin
            extracted = extract_text_from_page(upload)
            break
          rescue => e
            error = e
            seconds = backoff.shift || seconds
            sleep(seconds)
          end
        end

        if extracted
          extracted.each { |chunk| yield(chunk, upload) }
        else
          yield(nil, upload, error)
        end
        extracted || []
      end
      .flatten
  end

  private

  def system_message
    <<~MSG
      OCR the following page into Markdown. Tables should be formatted as GitHub flavored markdown.
      Do not surround your output with triple backticks.

      Chunk the document into sections of roughly 250 - 1000 words. Our goal is to identify parts of the page with the same semantic theme. These chunks will be embedded and used in a RAG pipeline.

      Always prefer returning text in Markdown vs HTML.
      Describe all the images and graphs you encounter.
      Only return text that will assist in the querying of data. Omit text such as "I had trouble recognizing images" and so on.

      Surround the chunks with <chunk> </chunk> html tags.
    MSG
  end

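  # Sends a single page image to the LLM (attached via upload_ids) and parses
  # the response into chunks.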
  def extract_text_from_page(page)
    llm = llm_model.to_llm
    messages = [{ type: :user, content: "process the following page", upload_ids: [page.id] }]
    prompt = DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
    result = llm.generate(prompt, user: Discourse.system_user)
    extract_chunks(result)
  end

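  # Splits the LLM response on <chunk> tags. For example,
  # "<chunk>Intro</chunk> <chunk>Details</chunk>" yields ["Intro", "Details"];
  # a response without chunk tags is returned whole as a single-element array.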
  def extract_chunks(text)
    return [] if text.nil? || text.empty?

    if text.include?("<chunk>") && text.include?("</chunk>")
      chunks = []
      remaining_text = text.dup

      while remaining_text.length > 0
        if remaining_text.start_with?("<chunk>")
          # Extract chunk content ("<chunk>" is 7 characters long)
          chunk_end = remaining_text.index("</chunk>")
          if chunk_end
            chunk = remaining_text[7..chunk_end - 1].strip
            chunks << chunk unless chunk.empty?
            # Skip past the closing tag ("</chunk>" is 8 characters long)
            remaining_text = remaining_text[chunk_end + 8..-1] || ""
          else
            # Malformed chunk - no closing tag, keep the remaining text and stop
            chunks << remaining_text[7..-1].strip
            break
          end
        else
          # Handle text before the next chunk if it exists
          next_chunk = remaining_text.index("<chunk>")
          if next_chunk
            text_before = remaining_text[0...next_chunk].strip
            chunks << text_before unless text_before.empty?
            remaining_text = remaining_text[next_chunk..-1]
          else
            # No more chunks - keep the remaining text and stop
            chunks << remaining_text.strip
            break
          end
        end
      end

      return chunks.reject(&:empty?)
    end

    [text]
  end
end
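
For context, a minimal usage sketch (assuming an existing PDF upload, a configured llm_model, and a user; store_chunk_for_rag is a hypothetical downstream step, not part of this commit):

  converter = DiscourseAi::Utils::PdfToText.new(upload: upload, llm_model: llm_model, user: user)
  converter.extract_pages
  converter.extract_text do |chunk, page_upload, error|
    if error
      Rails.logger.warn("OCR failed for upload #{page_upload.id}: #{error.message}")
    else
      store_chunk_for_rag(chunk, page_upload) # hypothetical embedding/indexing step
    end
  end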
