|
# Builds a Paper from a URL that either serves a PDF directly or points at an
# arXiv abstract page (which is mapped to the matching arXiv PDF URL), then
# downloads the PDF and attaches it to the saved record.
#
# NOTE(review): Paper, MetaInspector and Rails are defined outside this file;
# Paper is presumably an ActiveRecord model with an Active Storage `pdf`
# attachment -- confirm against the model.
class PdfPaperCreator
  require "net/http"
  require "uri"
  require "digest"
  require "stringio"

  # URL fragment identifying an arXiv abstract page, and its PDF counterpart.
  ARXIV_ABSTRACT_PATH = "arxiv.org/abs/".freeze
  ARXIV_PDF_PATH = "arxiv.org/pdf/".freeze

  # Media type expected from a HEAD request and recorded on the attachment.
  PDF_CONTENT_TYPE = "application/pdf".freeze

  # Explicit timeouts (seconds) so a slow or unresponsive host cannot hold the
  # caller for Net::HTTP's much longer defaults.
  OPEN_TIMEOUT = 10
  READ_TIMEOUT = 30

  # @param url [String, nil] URL of a PDF or of an arXiv abstract page
  def initialize(url)
    @url = url
  end

  # Convenience wrapper around `new(url).create`.
  #
  # @param url [String, nil]
  # @return [Paper, false] see #create
  def self.create_from_url(url)
    new(url).create
  end

  # Creates and saves a Paper for the URL, then tries to attach the PDF.
  #
  # The download is best-effort: a failed download is logged and the saved
  # paper is still returned, without an attachment.
  #
  # @return [Paper, false] the saved paper, or false when the URL is missing,
  #   malformed, not recognised as a PDF, or the paper fails to save
  def create
    return false unless pdf_url?(@url)

    paper = Paper.new(url: pdf_url)
    assign_metadata(paper)
    return false unless paper.save

    download_and_attach_pdf(paper, pdf_url)
    paper
  end

  private

  # Whether the URL contains the arXiv abstract-page path.
  def arxiv_abstract?(url)
    url.include?(ARXIV_ABSTRACT_PATH)
  end

  # Fills in title and description: scraped from the abstract page for arXiv
  # URLs, generic placeholders otherwise.
  #
  # NOTE(review): MetaInspector.new presumably performs a network request and
  # may raise; any such error propagates out of #create -- confirm whether
  # callers expect that.
  def assign_metadata(paper)
    if arxiv_abstract?(@url)
      page = MetaInspector.new(@url)
      paper.title = page.best_title
      paper.description = page.best_description
    else
      paper.title = "PDF Document"
      paper.description = "PDF document from #{@url}"
    end
  end

  # Decides whether the URL should be treated as a PDF.
  #
  # arXiv abstract URLs are accepted without a request. Any other URL must be
  # a well-formed http(s) URL whose HEAD response has a Content-Type containing
  # "application/pdf" (compared case-insensitively).
  #
  # @param url [String, nil]
  # @return [Boolean] false for nil, malformed or non-HTTP URLs, and when the
  #   HEAD request raises
  def pdf_url?(url)
    return false unless url
    return true if arxiv_abstract?(url)

    uri = parse_http_uri(url)
    return false unless uri

    begin
      content_type = http_client_for(uri).request_head(uri.request_uri)["content-type"]
      # to_s covers a missing header so the predicate always returns a boolean.
      content_type.to_s.downcase.include?(PDF_CONTENT_TYPE)
    rescue StandardError
      false
    end
  end

  # Parses the URL, returning the URI only when it is an http(s) URI.
  #
  # @return [URI::HTTP, nil] nil when the URL is malformed or uses another scheme
  def parse_http_uri(url)
    uri = URI.parse(url)
    uri if uri.is_a?(URI::HTTP)
  rescue URI::InvalidURIError
    nil
  end

  # Builds a Net::HTTP client for the URI's host and port, enabling TLS for
  # https and applying the class timeouts.
  def http_client_for(uri)
    Net::HTTP.new(uri.host, uri.port).tap do |http|
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = OPEN_TIMEOUT
      http.read_timeout = READ_TIMEOUT
    end
  end

  # The URL to download: the arXiv PDF URL for abstract pages, otherwise the
  # original URL unchanged.
  def pdf_url
    return @url unless arxiv_abstract?(@url)

    # sub, not gsub: only the first path occurrence is rewritten, so a later
    # occurrence of the same text (e.g. in a query string) is left alone.
    @url.sub(ARXIV_ABSTRACT_PATH, ARXIV_PDF_PATH)
  end

  # Downloads the PDF and attaches it to the paper.
  #
  # Failures (non-200 status or any raised error) are logged through
  # Rails.logger and otherwise ignored.
  #
  # @param paper [Paper] record receiving the attachment
  # @param url [String] URL to download from
  def download_and_attach_pdf(paper, url)
    uri = URI.parse(url)
    http_response = http_client_for(uri).get(uri.request_uri)

    unless http_response.code == "200"
      Rails.logger.error "Failed to download PDF: HTTP #{http_response.code} for #{url}"
      return
    end

    paper.pdf.attach(
      io: StringIO.new(http_response.body),
      # The filename is a digest of the URL, so it is stable per source URL.
      filename: "#{Digest::SHA2.hexdigest(url)}.pdf",
      content_type: PDF_CONTENT_TYPE
    )
  rescue StandardError => e
    Rails.logger.error "Failed to download PDF: #{e.message}"
  end
end
0 commit comments