# frozen_string_literal: true

class DiscourseAi::Utils::PdfToText
  MAX_PDF_SIZE = 100.megabytes
  MAX_CONVERT_SECONDS = 30
  BACKOFF_SECONDS = [5, 30, 60]

  attr_reader :upload, :llm_model, :user

  def initialize(upload:, llm_model:, user:)
    @upload = upload
    @llm_model = llm_model
    @user = user
    @uploaded_pages = UploadReference.where(target: upload).map(&:upload)
  end

  def extract_pages
    temp_dir = File.join(Dir.tmpdir, "discourse-pdf-#{SecureRandom.hex(8)}")
    FileUtils.mkdir_p(temp_dir)

    begin
      pdf_path =
        if upload.local?
          Discourse.store.path_for(upload)
        else
          Discourse.store.download_safe(upload, max_file_size_kb: MAX_PDF_SIZE)&.path
        end

      raise Discourse::InvalidParameters.new("Failed to download PDF") if pdf_path.nil?

      temp_pdf = File.join(temp_dir, "source.pdf")
      FileUtils.cp(pdf_path, temp_pdf)

      # Convert PDF to individual page images
      output_pattern = File.join(temp_dir, "page-%04d.png")

      command = [
        "magick",
        "-density",
        "300",
        temp_pdf,
        "-background",
        "white",
        "-auto-orient",
        "-quality",
        "85",
        output_pattern,
      ]

      Discourse::Utils.execute_command(
        *command,
        failure_message: "Failed to convert PDF to images",
        timeout: MAX_CONVERT_SECONDS,
      )

      uploads = []
      Dir
        .glob(File.join(temp_dir, "page-*.png"))
        .sort
        .each do |page_path|
          upload =
            UploadCreator.new(File.open(page_path), "page-#{File.basename(page_path)}").create_for(
              @user.id,
            )

          uploads << upload
        end

      # Create upload references
      UploadReference.ensure_exist!(upload_ids: uploads.map(&:id), target: @upload)

      @uploaded_pages = uploads
    ensure
      FileUtils.rm_rf(temp_dir) if Dir.exist?(temp_dir)
    end
  end

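  # Runs OCR on each page image using the configured LLM. Yields
  # (chunk, page_upload) for every extracted chunk, or (nil, page_upload, error)
  # when a page still fails after all retries; returns the flattened array of
  # extracted chunks.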
  def extract_text(uploads: nil, retries: 3)
    uploads ||= @uploaded_pages

    raise "must specify a block" if !block_given?
    uploads
      .map do |upload|
        extracted = nil
        error = nil

        backoff = BACKOFF_SECONDS.dup
        seconds = nil

        retries.times do
          begin
            extracted = extract_text_from_page(upload)
            break
          rescue => e
            error = e
            # Back off before the next attempt; reuse the last interval once
            # BACKOFF_SECONDS has been exhausted.
            seconds = backoff.shift || seconds
            sleep(seconds)
          end
        end
        if extracted
          extracted.each { |chunk| yield(chunk, upload) }
        else
          yield(nil, upload, error)
        end
        extracted || []
      end
      .flatten
  end

  private

  def system_message
    <<~MSG
      OCR the following page into Markdown. Tables should be formatted as GitHub flavored Markdown.
      Do not surround your output with triple backticks.

      Chunk the document into sections of roughly 250 - 1000 words. Our goal is to identify parts of the page with the same semantic theme. These chunks will be embedded and used in a RAG pipeline.

      Always prefer returning text in Markdown vs HTML.
      Describe all the images and graphs you encounter.
      Only return text that will assist in the querying of data. Omit text such as "I had trouble recognizing images" and so on.

      Surround the chunks with <chunk> </chunk> HTML tags.
    MSG
  end

  def extract_text_from_page(page)
    llm = llm_model.to_llm
    messages = [{ type: :user, content: "process the following page", upload_ids: [page.id] }]
    prompt = DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
    result = llm.generate(prompt, user: Discourse.system_user)
    extract_chunks(result)
  end

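  # Parses the model response into an array of chunk strings by scanning for
  # <chunk>...</chunk> tags. Text outside the tags is kept as its own chunk,
  # and a response without any tags is returned as a single-element array.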
  def extract_chunks(text)
    return [] if text.nil? || text.empty?

    if text.include?("<chunk>") && text.include?("</chunk>")
      chunks = []
      remaining_text = text.dup

      while remaining_text.length > 0
        if remaining_text.start_with?("<chunk>")
          # Extract chunk content
          chunk_end = remaining_text.index("</chunk>")
          if chunk_end
            chunk = remaining_text[7..chunk_end - 1].strip
            chunks << chunk unless chunk.empty?
            remaining_text = remaining_text[chunk_end + 8..-1] || ""
          else
            # Malformed chunk - add remaining text and break
            chunks << remaining_text[7..-1].strip
            break
          end
        else
          # Handle text before next chunk if it exists
          next_chunk = remaining_text.index("<chunk>")
          if next_chunk
            text_before = remaining_text[0...next_chunk].strip
            chunks << text_before unless text_before.empty?
            remaining_text = remaining_text[next_chunk..-1]
          else
            # No more chunks - add remaining text and break
            chunks << remaining_text.strip
            break
          end
        end
      end

      return chunks.reject(&:empty?)
    end

    [text]
  end
end
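
# Illustrative usage sketch (not part of the class). It assumes you already have
# a PDF `upload`, an `llm_model`, and a `user`; `persist_chunk` is a hypothetical
# callback standing in for whatever the caller does with each extracted chunk.
#
#   converter = DiscourseAi::Utils::PdfToText.new(upload: upload, llm_model: llm_model, user: user)
#   converter.extract_pages
#   converter.extract_text do |chunk, page_upload, error|
#     if chunk
#       persist_chunk(chunk, page_upload)
#     else
#       Rails.logger.warn("PDF OCR failed for page upload #{page_upload.id}: #{error}")
#     end
#   end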