diff --git a/Dockerfile b/Dockerfile index 6490ddea..bcacfb0f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,12 +13,14 @@ COPY . . RUN apt-get update -qq && apt-get install -y build-essential libpq-dev postgresql postgresql-contrib openssl sudo && \ curl -sS http://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - && \ echo "deb http://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list && \ - curl -sL https://deb.nodesource.com/setup_12.x | bash - && \ + curl -sL https://deb.nodesource.com/setup_16.x | bash - && \ apt-get update -qq && apt-get install -y yarn nodejs && \ apt clean && \ rm -rf /var/lib/apt/lists/* && \ yarn +RUN apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2 + RUN gem install bundler -v 2.4.14 COPY Gemfile Gemfile.lock ./ diff --git a/README.md b/README.md index cc62cdaa..443ab8a1 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,11 @@ So, 5. Sign-in +6. To debug the db, try `docker exec -it db psql -U postgres`. Due to a bug in the seeds you will currently need to: +``` +insert into document_types values (0, 'terms of service', now(), now(), null, 1, 'approved'); +update documents set url='http://example.com', selector='body'; +``` To **annotate** a service, navigate to the services page from the top-right menu, choose a service, and click `View Documents`. Begin by highlighting a piece of text from this page. **H and the Hypothesis client must be running.** For a demonstration of how annotations work, feel free to [inspect the video attached to this PR](https://github.com/tosdr/edit.tosdr.org/pull/1116). diff --git a/app/assets/stylesheets/components/_alert.scss b/app/assets/stylesheets/components/_alert.scss index 33aa733d..238513d6 100644 --- a/app/assets/stylesheets/components/_alert.scss +++ b/app/assets/stylesheets/components/_alert.scss @@ -4,12 +4,4 @@ .alert { margin: -10px 0 10px; - text-align: center; - color: white; -} -.alert-info { - background: $green; -} -.alert-warning { - background: $red; } diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 97ce5024..3f395203 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -9,6 +9,8 @@ class ApplicationController < ActionController::Base before_action :configure_permitted_parameters, if: :devise_controller? 
before_action :set_paper_trail_whodunnit + add_flash_types :info, :error, :warning + def configure_permitted_parameters # For additional in app/views/devise/registrations/edit.html.erb devise_parameter_sanitizer.permit(:account_update, keys: [:username]) diff --git a/app/controllers/documents_controller.rb b/app/controllers/documents_controller.rb index acf21832..7f85d50e 100644 --- a/app/controllers/documents_controller.rb +++ b/app/controllers/documents_controller.rb @@ -10,26 +10,11 @@ class DocumentsController < ApplicationController include Pundit::Authorization - PROD_CRAWLERS = { - "https://api.tosdr.org/crawl/v1": 'Random', - "https://api.tosdr.org/crawl/v1/eu": 'Europe (Recommended)', - "https://api.tosdr.org/crawl/v1/us": 'United States (Recommended)', - "https://api.tosdr.org/crawl/v1/eu-central": 'Europe (Central)', - "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)', - "https://api.tosdr.org/crawl/v1/us-east": 'United States (East)', - "https://api.tosdr.org/crawl/v1/us-west": 'United States (West)' - }.freeze - - DEV_CRAWLERS = { - "http://localhost:5000": 'Standalone (localhost:5000)', - "http://crawler:5000": 'Docker-Compose (crawler:5000)' - }.freeze - before_action :authenticate_user!, except: %i[index show] before_action :set_document, only: %i[show edit update crawl restore_points] before_action :set_services, only: %i[new edit create update] before_action :set_document_names, only: %i[new edit create update] - before_action :set_crawlers, only: %i[new edit create update] + before_action :set_uri, only: %i[new edit create update crawl] rescue_from Pundit::NotAuthorizedError, with: :user_not_authorized @@ -55,18 +40,20 @@ def create @document.user = current_user @document.name = @document.document_type.name if @document.document_type + document_url = document_params[:url] + selector = document_params[:selector] + + request = build_request(document_url, @uri, selector) + results = fetch_text(request, @uri, @document) + + @document = results[:document] + message = results[:message] + if @document.save - crawl_result = perform_crawl - - unless crawl_result.nil? - if crawl_result['error'] - flash[:alert] = crawler_error_message(crawl_result) - else - flash[:notice] = 'The crawler has updated the document' - end - end + flash[:notice] = message redirect_to document_path(@document) else + flash.now[:warning] = message.html_safe if message render :new end end @@ -82,25 +69,25 @@ def update @document.name = document_type.name unless @document.name == document_type.name end - # we should probably only be running the crawler if the URL or XPath have changed - run_crawler = @document.saved_changes.keys.any? { |attribute| %w[url xpath crawler_server].include? attribute } - crawl_result = perform_crawl if run_crawler + # we should probably only be running the crawler if the URL or css selector have changed + run_crawler = @document.saved_changes.keys.any? { |attribute| %w[url selector].include? attribute } - if @document.save - # only want to do this if XPath or URL have changed - ## text is returned blank when there's a defunct URL or XPath - ### avoids server error upon 404 error in the crawler - # need to alert people if the crawler wasn't able to retrieve any text... - unless crawl_result.nil? 
- if crawl_result['error'] - flash[:alert] = crawler_error_message(crawl_result) - else - flash[:notice] = 'The crawler has updated the document' - end - end + if run_crawler + request = build_request(@document.url, @uri, @document.selector) + results = fetch_text(request, @uri, @document) + + @document = results[:document] + message = results[:message] + crawl_sucessful = results[:crawl_sucessful] + end + + if crawl_sucessful && @document.save + flash[:notice] = 'Document updated!' redirect_to document_path(@document) else - render 'edit', locals: { crawlers: PROD_CRAWLERS } + message ||= 'Document failed to update' + flash.now[:warning] = message + render 'edit' end end @@ -127,12 +114,26 @@ def show def crawl authorize @document - crawl_result = perform_crawl - if crawl_result['error'] - flash[:alert] = crawler_error_message(crawl_result) + + old_text = @document.text + request = build_request(@document.url, @uri, @document.selector) + results = fetch_text(request, @uri, @document) + + @document = results[:document] + message = results[:message] + crawl_sucessful = results[:crawl_sucessful] + + text_changed = old_text != @document.text + + if crawl_sucessful && text_changed && @document.save + flash[:notice] = 'Crawl successful. Document text updated!' + elsif crawl_sucessful && !text_changed + flash[:notice] = 'Crawl successful. Document text unchanged.' else - flash[:notice] = 'The crawler has updated the document' + message ||= 'Crawl failed!' + flash.now[:warning] = message end + redirect_to document_path(@document) end @@ -168,92 +169,60 @@ def set_document_names @document_names = DocumentType.where(status: 'approved').order('name ASC') end - def set_crawlers - @crawlers = Rails.env.development? ? DEV_CRAWLERS : PROD_CRAWLERS - end - def document_params - params.require(:document).permit(:service, :service_id, :user_id, :document_type_id, :name, :url, :xpath, :crawler_server) + params.require(:document).permit(:service, :service_id, :user_id, :document_type_id, :name, :url, :selector) end - def crawler_error_message(result) - message = result['message']['name'].to_s - region = result['message']['crawler'].to_s - stacktrace = CGI::escapeHTML(result['message']['remoteStacktrace'].to_s) - - `It seems that our crawler wasn't able to retrieve any text.

Reason: #{message}<br>Region: #{region}<br>
Stacktrace: #{stacktrace}` + def set_uri + url = ENV['OTA_URL'] + @uri = URI(url) end - # to-do: refactor out comment assembly - def perform_crawl - authorize @document - @tbdoc = TOSBackDoc.new({ - url: @document.url, - xpath: @document.xpath, - server: @document.crawler_server - }) - - @tbdoc.scrape - @document_comment = DocumentComment.new - - error = @tbdoc.apiresponse['error'] - if error - message_name = @tbdoc.apiresponse['message']['name'] || '' - crawler = @tbdoc.apiresponse['message']['crawler'] || '' - stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || '' - @document_comment.summary = 'Attempted to Crawl Document
Error Message: ' + message_name + '<br>Crawler: ' + crawler + '<br>
Stacktrace: ' + stacktrace + '' - @document_comment.user_id = current_user.id - @document_comment.document_id = @document.id - end + def build_request(document_url, uri, selector) + request = Net::HTTP::Post.new(uri) + params = '{"fetch": "' + document_url + '","select": "' + selector + '"}' + request.body = params + request.content_type = 'application/json' + token = ENV['OTA_API_SECRET'] + request['Authorization'] = "Bearer #{token}" - document_blank = !@document.text.blank? - old_length = document_blank ? @document.text.length : 0 - old_crc = document_blank ? Zlib.crc32(@document.text) : 0 - new_crc = Zlib.crc32(@tbdoc.newdata) - changes_made = old_crc != new_crc - - if changes_made - @document.update(text: @tbdoc.newdata) - new_length = @document.text ? @document.text.length : 'no text retrieved by crawler' - - # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org - # server which runs once a day and before it deploys the site from edit.tosdr.org - # to tosdr.org, it will run the check_quotes script from - # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123 - # So that if text has moved without changing, points are updated to the corrected - # quote_start, quote_end, and quote_text values where possible, and/or their status is - # switched between: - # pending <-> pending-not-found - # approved <-> approved-not-found - crawler = @tbdoc.apiresponse['message']['crawler'] || '' - @document_comment.summary = 'Document has been crawled
Old length: ' + old_length.to_s + ' CRC ' + old_crc.to_s + '<br>New length: ' + new_length.to_s + ' CRC ' + new_crc.to_s + '<br>
Crawler: ' + crawler + '' - @document_comment.user_id = current_user.id - @document_comment.document_id = @document.id - end - - unless changes_made - @tbdoc.apiresponse['error'] = true - @tbdoc.apiresponse['message'] = { - 'name' => 'The source document has not been updated. No changes made.', - 'remoteStacktrace' => 'SourceDocument' - } - end + request + end - message_name = @tbdoc.apiresponse['message']['name'] || '' - crawler = @tbdoc.apiresponse['message']['crawler'] || '' - stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || '' + def fetch_text(request, uri, document) + crawl_sucessful = false - @document_comment.summary = 'Attempted to Crawl Document
Error Message: ' + message_name + '<br>Crawler: ' + crawler + '<br>Stacktrace: ' + stacktrace + '' - @document_comment.user_id = current_user.id - @document_comment.document_id = @document.id + begin + response_text = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http| + http.request(request) + end - if @document_comment.save - puts 'Comment added!' - else - puts 'Error adding comment!' - puts @document_comment.errors.full_messages + case response_text + when Net::HTTPSuccess + puts 'HTTP Success' + response_body = response_text.body + parsed_response_body = JSON.parse(response_body) + document.text = parsed_response_body + crawl_sucessful = true + message = 'Document created!' + else + Rails.logger.error("HTTP Error: #{response_text.code} - #{response_text.message}") + message = "HTTP Error: Could not retrieve document text. Contact team@tosdr.org. Details: #{response_text.code} - #{response_text.message}" + end + rescue SocketError => e + # Handle network-related errors + Rails.logger.error("Network Error: #{e.message}") + message = "Network Error: Crawler unreachable. Could not retrieve document text. Contact team@tosdr.org. Details: #{e.message}" + rescue Timeout::Error => e + # Handle timeout errors + Rails.logger.error("Timeout Error: #{e.message}") + message = "Timeout Error: Could not retrieve document text. Contact team@tosdr.org. Details: #{e.message}" + rescue StandardError => e + # Handle any other standard errors + Rails.logger.error("Standard Error: #{e.message}") + message = "Standard Error: Could not retrieve document text. Is the crawler running? Contact team@tosdr.org. Details: #{e.message}" end - @tbdoc.apiresponse + { document: document, message: message, crawl_sucessful: crawl_sucessful } end end diff --git a/app/models/document.rb index 8119fb93..346dd542 100644 --- a/app/models/document.rb +++ b/app/models/document.rb @@ -11,16 +11,18 @@ class Document < ApplicationRecord has_many :points has_many :document_comments, dependent: :destroy + validates :document_type_id, presence: true validates :name, presence: true - validates :url, presence: true validates :service_id, presence: true - validates :document_type_id, presence: true + validates :text, presence: true + validates :url, presence: true + validates :selector, presence: true validate :location_uniqueness_check validate :document_type_uniqueness_check def location_uniqueness_check - doc = Document.where(url: url, xpath: xpath, status: nil) + doc = Document.where(url: url, selector: selector, status: nil) return unless doc.any? && (doc.first.id != id) @@ -82,7 +84,6 @@ def fetch_ota_text self.text = document_html self.ota_sourced = true self.url = document_ota_url - self.crawler_server = nil save end diff --git a/app/views/documents/_form.html.erb index 376512c6..715791e3 100644 --- a/app/views/documents/_form.html.erb +++ b/app/views/documents/_form.html.erb @@ -10,11 +10,9 @@ <%= f.association :document_type, collection: document_names, hint: "Inspect the document types and their descriptions #{link_to 'here', document_types_path}".html_safe %> - <%= f.input :url, hint: "The web location at which we can fetch the text of this document", placeholder: "e.g. \"https://www.facebook.com/about/privacy\"" %> + <%= f.input :url, hint: "The web location where we can fetch the text of this document", placeholder: "e.g. \"https://www.facebook.com/about/privacy\"" %> - <%= f.input :xpath, input_html: { value: (f.object.xpath.present?) ? f.object.xpath : '//body' }, placeholder: "e.g. 
\"//*[@id='content']//div[@class='_5tkp _3m9']\"", hint: "The location of the terms on the web page using XPath".html_safe %> - - <%= f.input :crawler_server, collection: crawlers, label_method: :second, value_method: :first, hint: "Select which crawler should be used (optional, useful if blocked by EU)".html_safe, selected: document.crawler_server || "eu.crawler.api.tosdr.org" %> + <%= f.input :selector, input_html: { value: (f.object.selector.present?) ? f.object.selector : 'body' }, placeholder: "e.g. \".products>a:nth-of-type(2)\"", hint: "The location of the terms on the web page using CSS selectors - CSS selectors cheat sheet".html_safe %> <%= f.submit "Crawl Document", name: "only_create", class: 'btn btn-default lighter' %> diff --git a/app/views/documents/edit.html.erb b/app/views/documents/edit.html.erb index 1695d2f1..78329782 100644 --- a/app/views/documents/edit.html.erb +++ b/app/views/documents/edit.html.erb @@ -32,7 +32,7 @@

- <%= render 'form', document: @document, document_names: @document_names, crawlers: @crawlers, services: @services %> + <%= render 'form', document: @document, document_names: @document_names, services: @services %>
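For reference, the request that `build_request` assembles in `DocumentsController` can be exercised from a Rails console roughly as below. This is a minimal sketch: the fallback endpoint URL is a placeholder (the real value comes from `OTA_URL`, as in the controller), `OTA_API_SECRET` is whatever bearer token the crawler expects, and building the body with `Hash#to_json` is an alternative to the manual string concatenation in `build_request` that avoids quoting problems when the URL or selector contains special characters.

```ruby
# Minimal sketch of the request build_request assembles. OTA_URL and
# OTA_API_SECRET are read from the environment; the fallback URL below is a
# placeholder, not the real crawler endpoint.
require 'net/http'
require 'json'

uri = URI(ENV.fetch('OTA_URL', 'http://localhost:5000/extract'))

request = Net::HTTP::Post.new(uri)
request.content_type = 'application/json'
request['Authorization'] = "Bearer #{ENV['OTA_API_SECRET']}"
# Same payload shape as build_request: the page to fetch and a CSS selector.
request.body = { fetch: 'https://www.facebook.com/about/privacy', select: 'body' }.to_json

response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
  http.request(request)
end

puts response.code
puts response.body # expected to be JSON containing the extracted document text
```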
diff --git a/app/views/documents/new.html.erb b/app/views/documents/new.html.erb index c84fe33b..a9ab74d0 100644 --- a/app/views/documents/new.html.erb +++ b/app/views/documents/new.html.erb @@ -32,7 +32,7 @@

- <%= render 'form', document: @document, document_names: @document_names, crawlers: @crawlers, services: @services %> + <%= render 'form', document: @document, document_names: @document_names, services: @services %>
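With the model changes above, `text`, `url` and `selector` are now required alongside `name`, `service_id` and `document_type_id`, so documents created outside the crawl flow (seeds, console scripts) need all of these attributes. A rough console sketch, with placeholder ids that assume a service and an approved document type already exist:

```ruby
# Sketch only: the ids below are placeholders, not real records.
doc = Document.new(
  name: 'Terms of Service',
  service_id: 1,
  document_type_id: 1,
  url: 'https://example.com/terms',
  selector: 'body',
  text: 'Example terms text fetched by the crawler'
)
doc.valid? # the new presence validations for text, url and selector now apply
doc.save
```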
diff --git a/app/views/shared/_flashes.html.erb b/app/views/shared/_flashes.html.erb index 3bf462a4..01b8d9b8 100644 --- a/app/views/shared/_flashes.html.erb +++ b/app/views/shared/_flashes.html.erb @@ -1,12 +1,25 @@ <% if notice %> -
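The `add_flash_types :info, :error, :warning` call added to `ApplicationController` registers those keys as flash types, so controllers can pass them directly to `redirect_to` and views get matching helpers, in addition to the plain `flash[:warning]` access used above. A rough sketch of the convenience this enables (the controller and message here are illustrative, not part of this change):

```ruby
# Sketch only: once add_flash_types :info, :error, :warning is registered in
# ApplicationController, the extra keys can be passed straight to redirect_to.
class ExampleController < ApplicationController
  def update
    # ... perform the update ...
    redirect_to documents_path, warning: 'Document failed to update'
    # equivalent to setting flash[:warning] before redirecting
  end
end
```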