tosdr · madoleary · Apr 29, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -13,12 +13,14 @@ COPY . .
 RUN apt-get update -qq && apt-get install -y build-essential libpq-dev postgresql postgresql-contrib openssl sudo && \
     curl -sS http://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - && \
     echo "deb http://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list && \
-    curl -sL https://deb.nodesource.com/setup_12.x | bash - && \
+    curl -sL https://deb.nodesource.com/setup_16.x | bash - && \
     apt-get update -qq && apt-get install -y yarn nodejs && \
     apt clean && \
     rm -rf /var/lib/apt/lists/* && \
     yarn
 
+RUN apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2
+
 RUN gem install bundler -v 2.4.14
 COPY Gemfile Gemfile.lock ./
 

diff --git a/README.md b/README.md
@@ -93,6 +93,11 @@ So,
 
 5. Sign-in
 
+6. To debug the database, run `docker exec -it db psql -U postgres`. Due to a bug in the seeds, you will currently need to run the following SQL:
+```
+insert into document_types values (0, 'terms of service', now(), now(), null, 1, 'approved');
+update documents set url='http://example.com', selector='body';
+```
 To **annotate** a service, navigate to the services page from the top-right menu, choose a service, and click `View Documents`. Begin by highlighting a piece of text from this page. **H and the Hypothesis client must be running.**
 
 For a demonstration of how annotations work, feel free to [inspect the video attached to this PR](https://github.com/tosdr/edit.tosdr.org/pull/1116).

diff --git a/app/assets/stylesheets/components/_alert.scss b/app/assets/stylesheets/components/_alert.scss
@@ -4,12 +4,4 @@
 
 .alert {
   margin: -10px 0 10px;
-  text-align: center;
-  color: white;
-}
-.alert-info {
-  background: $green;
-}
-.alert-warning {
-  background: $red;
 }
diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
@@ -9,6 +9,8 @@ class ApplicationController < ActionController::Base
   before_action :configure_permitted_parameters, if: :devise_controller?
   before_action :set_paper_trail_whodunnit
 
+  add_flash_types :info, :error, :warning
+
   def configure_permitted_parameters
     # For additional in app/views/devise/registrations/edit.html.erb
     devise_parameter_sanitizer.permit(:account_update, keys: [:username])

diff --git a/app/controllers/documents_controller.rb b/app/controllers/documents_controller.rb
@@ -10,26 +10,11 @@
 class DocumentsController < ApplicationController
   include Pundit::Authorization
 
-  PROD_CRAWLERS = {
-    "https://api.tosdr.org/crawl/v1": 'Random',
-    "https://api.tosdr.org/crawl/v1/eu": 'Europe (Recommended)',
-    "https://api.tosdr.org/crawl/v1/us": 'United States (Recommended)',
-    "https://api.tosdr.org/crawl/v1/eu-central": 'Europe (Central)',
-    "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
-    "https://api.tosdr.org/crawl/v1/us-east": 'United States (East)',
-    "https://api.tosdr.org/crawl/v1/us-west": 'United States (West)'
-  }.freeze
-
-  DEV_CRAWLERS = {
-    "http://localhost:5000": 'Standalone (localhost:5000)',
-    "http://crawler:5000": 'Docker-Compose (crawler:5000)'
-  }.freeze
-
   before_action :authenticate_user!, except: %i[index show]
   before_action :set_document, only: %i[show edit update crawl restore_points]
   before_action :set_services, only: %i[new edit create update]
   before_action :set_document_names, only: %i[new edit create update]
-  before_action :set_crawlers, only: %i[new edit create update]
+  before_action :set_uri, only: %i[new edit create update crawl]
 
   rescue_from Pundit::NotAuthorizedError, with: :user_not_authorized
 
@@ -55,18 +40,20 @@ def create
     @document.user = current_user
     @document.name = @document.document_type.name if @document.document_type
 
+    document_url = document_params[:url]
+    selector = document_params[:selector]
+
+    request = build_request(document_url, @uri, selector)
+    results = fetch_text(request, @uri, @document)
+
+    @document = results[:document]
+    message = results[:message]
+
     if @document.save
-      crawl_result = perform_crawl
-
-      unless crawl_result.nil?
-        if crawl_result['error']
-          flash[:alert] = crawler_error_message(crawl_result)
-        else
-          flash[:notice] = 'The crawler has updated the document'
-        end
-      end
+      flash[:notice] = message
       redirect_to document_path(@document)
     else
+      flash.now[:warning] = message.html_safe if message
       render :new
     end
   end
@@ -82,25 +69,25 @@ def update
       @document.name = document_type.name unless @document.name == document_type.name
     end
 
-    # we should probably only be running the crawler if the URL or XPath have changed
-    run_crawler = @document.saved_changes.keys.any? { |attribute| %w[url xpath crawler_server].include? attribute }
-    crawl_result = perform_crawl if run_crawler
+    # we should probably only be running the crawler if the URL or css selector have changed
+    run_crawler = @document.saved_changes.keys.any? { |attribute| %w[url selector].include? attribute }
 
-    if @document.save
-      # only want to do this if XPath or URL have changed
-      ## text is returned blank when there's a defunct URL or XPath
-      ### avoids server error upon 404 error in the crawler
-      # need to alert people if the crawler wasn't able to retrieve any text...
-      unless crawl_result.nil?
-        if crawl_result['error']
-          flash[:alert] = crawler_error_message(crawl_result)
-        else
-          flash[:notice] = 'The crawler has updated the document'
-        end
-      end
+    if run_crawler
+      request = build_request(@document.url, @uri, @document.selector)
+      results = fetch_text(request, @uri, @document)
+
+      @document = results[:document]
+      message = results[:message]
+      crawl_sucessful = results[:crawl_sucessful]
+    end
+
+    if crawl_sucessful && @document.save
+      flash[:notice] = 'Document updated!'
       redirect_to document_path(@document)
     else
-      render 'edit', locals: { crawlers: PROD_CRAWLERS }
+      message ||= 'Document failed to update'
+      flash.now[:warning] = message
+      render 'edit'
     end
   end
 
@@ -127,12 +114,26 @@ def show
 
   def crawl
     authorize @document
-    crawl_result = perform_crawl
-    if crawl_result['error']
-      flash[:alert] = crawler_error_message(crawl_result)
+
+    old_text = @document.text # snapshot first: fetch_text assigns the crawled text onto the same document
+    request = build_request(@document.url, @uri, @document.selector)
+    results = fetch_text(request, @uri, @document)
+
+    @document = results[:document]
+    message = results[:message]
+    crawl_sucessful = results[:crawl_sucessful]
+
+    text_changed = old_text != @document.text
+
+    if crawl_sucessful && text_changed && @document.save
+      flash[:notice] = 'Crawl successful. Document text updated!'
+    elsif crawl_sucessful && !text_changed
+      flash[:notice] = 'Crawl successful. Document text unchanged.'
     else
-      flash[:notice] = 'The crawler has updated the document'
+      message = 'Crawl successful, but the document could not be saved.' if crawl_sucessful # a successful crawl only lands here when save failed
+      flash[:warning] = message || 'Crawl failed!' # plain flash, not flash.now: it must survive the redirect below
     end
+
     redirect_to document_path(@document)
   end
 
@@ -168,92 +169,60 @@ def set_document_names
     @document_names = DocumentType.where(status: 'approved').order('name ASC')
   end
 
-  def set_crawlers
-    @crawlers = Rails.env.development? ? DEV_CRAWLERS : PROD_CRAWLERS
-  end
-
   def document_params
-    params.require(:document).permit(:service, :service_id, :user_id, :document_type_id, :name, :url, :xpath, :crawler_server)
+    params.require(:document).permit(:service, :service_id, :user_id, :document_type_id, :name, :url, :selector) # allow-list of document attributes accepted from request params
   end
 
-  def crawler_error_message(result)
-    message = result['message']['name'].to_s
-    region = result['message']['crawler'].to_s
-    stacktrace = CGI::escapeHTML(result['message']['remoteStacktrace'].to_s)
-
-    `It seems that our crawler wasn't able to retrieve any text. <br><br>Reason: #{message} <br>Region: #{region} <br>Stacktrace: #{stacktrace}`
+  def set_uri
+    url = ENV.fetch('OTA_URL') # fail fast with a KeyError naming the variable instead of URI(nil)'s opaque ArgumentError
+    @uri = URI(url)
   end
 
-  # to-do: refactor out comment assembly
-  def perform_crawl
-    authorize @document
-    @tbdoc = TOSBackDoc.new({
-                              url: @document.url,
-                              xpath: @document.xpath,
-                              server: @document.crawler_server
-                            })
-
-    @tbdoc.scrape
-    @document_comment = DocumentComment.new
-
-    error = @tbdoc.apiresponse['error']
-    if error
-      message_name = @tbdoc.apiresponse['message']['name'] || ''
-      crawler = @tbdoc.apiresponse['message']['crawler'] || ''
-      stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
-      @document_comment.summary = '<span class="label label-danger">Attempted to Crawl Document</span><br>Error Message: <kbd>' + message_name + '</kbd><br>Crawler: <kbd>' + crawler + '</kbd><br>Stacktrace: <kbd>' + stacktrace + '</kbd>'
-      @document_comment.user_id = current_user.id
-      @document_comment.document_id = @document.id
-    end
+  def build_request(document_url, uri, selector) # builds the authenticated JSON POST sent to the crawler at uri
+    request = Net::HTTP::Post.new(uri)
+    params = JSON.generate({ fetch: document_url, select: selector }) # serialise properly: quotes/backslashes in the inputs are escaped, never spliced into the JSON
+    request.body = params
+    request.content_type = 'application/json'
+    token = ENV['OTA_API_SECRET']
+    request['Authorization'] = "Bearer #{token}"
 
-    document_blank = [email protected]?
-    old_length = document_blank ? @document.text.length : 0
-    old_crc = document_blank ? Zlib.crc32(@document.text) : 0
-    new_crc = Zlib.crc32(@tbdoc.newdata)
-    changes_made = old_crc != new_crc
-
-    if changes_made
-      @document.update(text: @tbdoc.newdata)
-      new_length = @document.text ? @document.text.length : 'no text retrieved by crawler'
-
-      # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org
-      # server which runs once a day and before it deploys the site from edit.tosdr.org
-      # to tosdr.org, it will run the check_quotes script from
-      # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
-      # So that if text has moved without changing, points are updated to the corrected
-      # quote_start, quote_end, and quote_text values where possible, and/or their status is
-      # switched between:
-      # pending <-> pending-not-found
-      # approved <-> approved-not-found
-      crawler = @tbdoc.apiresponse['message']['crawler'] || ''
-      @document_comment.summary = '<span class="label label-info">Document has been crawled</span><br><b>Old length:</b> <kbd>' + old_length.to_s + ' CRC ' + old_crc.to_s + '</kbd><br><b>New length:</b> <kbd>' + new_length.to_s + ' CRC ' + new_crc.to_s + '</kbd><br> Crawler: <kbd>' + crawler + '</kbd>'
-      @document_comment.user_id = current_user.id
-      @document_comment.document_id = @document.id
-    end
-
-    unless changes_made
-      @tbdoc.apiresponse['error'] = true
-      @tbdoc.apiresponse['message'] = {
-        'name' => 'The source document has not been updated. No changes made.',
-        'remoteStacktrace' => 'SourceDocument'
-      }
-    end
+    request
+  end
 
-    message_name = @tbdoc.apiresponse['message']['name'] || ''
-    crawler = @tbdoc.apiresponse['message']['crawler'] || ''
-    stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
+  def fetch_text(request, uri, document) # sends request to the crawler at uri; returns { document:, message:, crawl_sucessful: }
+    crawl_sucessful = false
 
-    @document_comment.summary = '<span class="label label-danger">Attempted to Crawl Document</span><br>Error Message: <kbd>' + message_name + '</kbd><br>Crawler: <kbd>' + crawler + '</kbd><br>Stacktrace: <kbd>' + stacktrace + '</kbd>'
-    @document_comment.user_id = current_user.id
-    @document_comment.document_id = @document.id
+    begin
+      response_text = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
+        http.request(request)
+      end
 
-    if @document_comment.save
-      puts 'Comment added!'
-    else
-      puts 'Error adding comment!'
-      puts @document_comment.errors.full_messages
+      case response_text
+      when Net::HTTPSuccess
+        Rails.logger.info('HTTP Success')
+        response_body = response_text.body
+        parsed_response_body = JSON.parse(response_body)
+        document.text = parsed_response_body # assigned only, not saved; persisting is left to the caller
+        crawl_sucessful = true
+        message = 'Document created!'
+      else
+        Rails.logger.error("HTTP Error: #{response_text.code} - #{response_text.message}") # report the crawler's reply; a bare `response` here is not the crawler reply
+        message = "HTTP Error: Could not retrieve document text. Contact <a href='mailto:[email protected]'>[email protected]</a>. Details: #{response_text.code} - #{response_text.message}"
+      end
+    rescue SocketError => e
+      # Handle network-related errors
+      Rails.logger.error("Network Error: #{e.message}")
+      message = "Network Error: Crawler unreachable. Could not retrieve document text. Contact <a href='mailto:[email protected]'>[email protected]</a>. Details: #{e.message}"
+    rescue Timeout::Error => e
+      # Handle timeout errors
+      Rails.logger.error("Timeout Error: #{e.message}")
+      message = "Timeout Error:  Could not retrieve document text. Contact <a href='mailto:[email protected]'>[email protected]</a>. Details: #{e.message}"
+    rescue StandardError => e
+      # Handle any other standard errors (this also catches JSON::ParserError from a malformed success body)
+      Rails.logger.error("Standard Error: #{e.message}")
+      message = "Standard Error: Could not retrieve document text. Is the crawler running? Contact <a href='mailto:[email protected]'>[email protected]</a>. Details: #{e.message}"
     end
 
-    @tbdoc.apiresponse
+    { document: document, message: message, crawl_sucessful: crawl_sucessful }
   end
 end
diff --git a/app/models/document.rb b/app/models/document.rb
@@ -11,16 +11,18 @@ class Document < ApplicationRecord
   has_many :points
   has_many :document_comments, dependent: :destroy
 
+  validates :document_type_id, presence: true
   validates :name, presence: true
-  validates :url, presence: true
   validates :service_id, presence: true
-  validates :document_type_id, presence: true
+  validates :text, presence: true
+  validates :url, presence: true
+  validates :selector, presence: true
 
   validate :location_uniqueness_check
   validate :document_type_uniqueness_check
 
   def location_uniqueness_check
-    doc = Document.where(url: url, xpath: xpath, status: nil)
+    doc = Document.where(url: url, selector: selector, status: nil)
 
     return unless doc.any? && (doc.first.id != id)
 
@@ -82,7 +84,6 @@ def fetch_ota_text
     self.text = document_html
     self.ota_sourced = true
     self.url = document_ota_url
-    self.crawler_server = nil
     save
   end
 

diff --git a/app/views/documents/_form.html.erb b/app/views/documents/_form.html.erb
@@ -10,11 +10,9 @@
 
     <%= f.association :document_type, collection: document_names, hint: "Inspect the document types and their descriptions #{link_to 'here', document_types_path}".html_safe %>
 
-    <%= f.input :url, hint: "The web location at which we can fetch the text of this document", placeholder: "e.g. \"https://www.facebook.com/about/privacy\"" %>
+    <%= f.input :url, hint: "The web location where we can fetch the text of this document", placeholder: "e.g. \"https://www.facebook.com/about/privacy\"" %>
 
-    <%= f.input :xpath, input_html: { value: (f.object.xpath.present?) ? f.object.xpath : '//body' }, placeholder: "e.g. \"//*[@id='content']//div[@class='_5tkp _3m9']\"", hint: "The location of the terms on the web page using <a href=\"https://en.wikipedia.org/wiki/XPath\" title=\"Wikipedia explanation of XPath\" target=\"_blank\">XPath</a>".html_safe %>
-
-    <%= f.input :crawler_server, collection: crawlers, label_method: :second, value_method: :first, hint: "<a href=\"https://to.tosdr.org/U98u1\">Select which crawler should be used (optional, useful if blocked by EU)</a>".html_safe, selected: document.crawler_server || "eu.crawler.api.tosdr.org" %>
+    <%= f.input :selector, input_html: { value: (f.object.selector.present?) ? f.object.selector : 'body' }, placeholder: "e.g. \".products>a:nth-of-type(2)\"", hint: "The location of the terms on the web page using <a href=\"https://www.w3schools.com/cssref/css_selectors.php\" title=\"W3schools documentation of selectors\" target=\"_blank\">CSS selectors</a> - <a href=\"https://scrapfly.io/blog/css-selector-cheatsheet/\" title=\"CSS selectors cheat sheet\" target=\"_blank\">CSS selectors cheat sheet</a>".html_safe %>
   </span>
 
   <%= f.submit "Crawl Document", name: "only_create", class: 'btn btn-default lighter' %>

diff --git a/app/views/documents/edit.html.erb b/app/views/documents/edit.html.erb
@@ -32,7 +32,7 @@
       </div>
       <div class="panel-body">
         <br>
-          <%= render 'form', document: @document, document_names: @document_names, crawlers: @crawlers, services: @services %>
+          <%= render 'form', document: @document, document_names: @document_names, services: @services %>
         <br>
       </div>
     </div>

diff --git a/app/views/documents/new.html.erb b/app/views/documents/new.html.erb
@@ -32,7 +32,7 @@
       </div>
       <div class="panel-body">
         <br>
-          <%= render 'form', document: @document, document_names: @document_names, crawlers: @crawlers, services: @services %>
+          <%= render 'form', document: @document, document_names: @document_names, services: @services %>
         <br>
       </div>
     </div>
-Original file line number
+Diff line change
@@ Expand Up / @@ -32,7 +32,7 @@ @@
           </div>
           <div class="panel-body">
             <br>
-              <%= render 'form', document: @document, document_names: @document_names, crawlers: @crawlers, services: @services %>
+              <%= render 'form', document: @document, document_names: @document_names, services: @services %>
             <br>
           </div>
         </div>
@@ Expand Down @@