diff --git a/Dockerfile b/Dockerfile
index 6490ddea..bcacfb0f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,12 +13,14 @@ COPY . .
RUN apt-get update -qq && apt-get install -y build-essential libpq-dev postgresql postgresql-contrib openssl sudo && \
curl -sS http://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - && \
echo "deb http://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list && \
- curl -sL https://deb.nodesource.com/setup_12.x | bash - && \
+ curl -sL https://deb.nodesource.com/setup_16.x | bash - && \
apt-get update -qq && apt-get install -y yarn nodejs && \
apt clean && \
rm -rf /var/lib/apt/lists/* && \
yarn
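+# Shared libraries for headless Chromium (assumption: required by the Puppeteer-based crawler/test tooling added alongside Node 16)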
+RUN apt-get update && apt-get install -y libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libasound2
+
RUN gem install bundler -v 2.4.14
COPY Gemfile Gemfile.lock ./
diff --git a/README.md b/README.md
index cc62cdaa..443ab8a1 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,11 @@ So,
5. Sign-in
+6. To debug the db, try `docker exec -it db psql -U postgres`. Due to a bug in the seeds, you will currently need to run:
+```
+insert into document_types values (0, 'terms of service', now(), now(), null, 1, 'approved');
+update documents set url='http://example.com', selector='body';
+```
To **annotate** a service, navigate to the services page from the top-right menu, choose a service, and click `View Documents`. Begin by highlighting a piece of text on this page. **The Hypothesis server (h) and the Hypothesis client must be running.**
For a demonstration of how annotations work, feel free to [inspect the video attached to this PR](https://github.com/tosdr/edit.tosdr.org/pull/1116).
diff --git a/app/assets/stylesheets/components/_alert.scss b/app/assets/stylesheets/components/_alert.scss
index 33aa733d..238513d6 100644
--- a/app/assets/stylesheets/components/_alert.scss
+++ b/app/assets/stylesheets/components/_alert.scss
@@ -4,12 +4,4 @@
.alert {
margin: -10px 0 10px;
- text-align: center;
- color: white;
-}
-.alert-info {
- background: $green;
-}
-.alert-warning {
- background: $red;
}
diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb
index 97ce5024..3f395203 100644
--- a/app/controllers/application_controller.rb
+++ b/app/controllers/application_controller.rb
@@ -9,6 +9,8 @@ class ApplicationController < ActionController::Base
before_action :configure_permitted_parameters, if: :devise_controller?
before_action :set_paper_trail_whodunnit
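+ # Registers :info, :error and :warning as flash types, so they can be read via
+ # flash[:warning] and passed as redirect_to options (e.g. redirect_to path, warning: '...')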
+ add_flash_types :info, :error, :warning
+
def configure_permitted_parameters
# For additional in app/views/devise/registrations/edit.html.erb
devise_parameter_sanitizer.permit(:account_update, keys: [:username])
diff --git a/app/controllers/documents_controller.rb b/app/controllers/documents_controller.rb
index acf21832..7f85d50e 100644
--- a/app/controllers/documents_controller.rb
+++ b/app/controllers/documents_controller.rb
@@ -10,26 +10,11 @@
class DocumentsController < ApplicationController
include Pundit::Authorization
- PROD_CRAWLERS = {
- "https://api.tosdr.org/crawl/v1": 'Random',
- "https://api.tosdr.org/crawl/v1/eu": 'Europe (Recommended)',
- "https://api.tosdr.org/crawl/v1/us": 'United States (Recommended)',
- "https://api.tosdr.org/crawl/v1/eu-central": 'Europe (Central)',
- "https://api.tosdr.org/crawl/v1/eu-west": 'Europe (West)',
- "https://api.tosdr.org/crawl/v1/us-east": 'United States (East)',
- "https://api.tosdr.org/crawl/v1/us-west": 'United States (West)'
- }.freeze
-
- DEV_CRAWLERS = {
- "http://localhost:5000": 'Standalone (localhost:5000)',
- "http://crawler:5000": 'Docker-Compose (crawler:5000)'
- }.freeze
-
before_action :authenticate_user!, except: %i[index show]
before_action :set_document, only: %i[show edit update crawl restore_points]
before_action :set_services, only: %i[new edit create update]
before_action :set_document_names, only: %i[new edit create update]
- before_action :set_crawlers, only: %i[new edit create update]
+ before_action :set_uri, only: %i[new edit create update crawl]
rescue_from Pundit::NotAuthorizedError, with: :user_not_authorized
@@ -55,18 +40,20 @@ def create
@document.user = current_user
@document.name = @document.document_type.name if @document.document_type
+ document_url = document_params[:url]
+ selector = document_params[:selector]
+
+ request = build_request(document_url, @uri, selector)
+ results = fetch_text(request, @uri, @document)
+
+ @document = results[:document]
+ message = results[:message]
+
if @document.save
- crawl_result = perform_crawl
-
- unless crawl_result.nil?
- if crawl_result['error']
- flash[:alert] = crawler_error_message(crawl_result)
- else
- flash[:notice] = 'The crawler has updated the document'
- end
- end
+ flash[:notice] = message
redirect_to document_path(@document)
else
+ flash.now[:warning] = message if message
render :new
end
end
@@ -82,25 +69,25 @@ def update
@document.name = document_type.name unless @document.name == document_type.name
end
- # we should probably only be running the crawler if the URL or XPath have changed
- run_crawler = @document.saved_changes.keys.any? { |attribute| %w[url xpath crawler_server].include? attribute }
- crawl_result = perform_crawl if run_crawler
+ # we should probably only be running the crawler if the URL or CSS selector has changed
+ run_crawler = @document.saved_changes.keys.any? { |attribute| %w[url selector].include? attribute }
- if @document.save
- # only want to do this if XPath or URL have changed
- ## text is returned blank when there's a defunct URL or XPath
- ### avoids server error upon 404 error in the crawler
- # need to alert people if the crawler wasn't able to retrieve any text...
- unless crawl_result.nil?
- if crawl_result['error']
- flash[:alert] = crawler_error_message(crawl_result)
- else
- flash[:notice] = 'The crawler has updated the document'
- end
- end
+ if run_crawler
+ request = build_request(@document.url, @uri, @document.selector)
+ results = fetch_text(request, @uri, @document)
+
+ @document = results[:document]
+ message = results[:message]
+ crawl_successful = results[:crawl_successful]
+ else
+ # nothing relevant changed, so there is nothing to crawl and nothing to block the save
+ crawl_successful = true
+ end
+
+ if crawl_successful && @document.save
+ flash[:notice] = 'Document updated!'
redirect_to document_path(@document)
else
- render 'edit', locals: { crawlers: PROD_CRAWLERS }
+ message ||= 'Document failed to update'
+ flash.now[:warning] = message
+ render 'edit'
end
end
@@ -127,12 +114,26 @@ def show
def crawl
authorize @document
- crawl_result = perform_crawl
- if crawl_result['error']
- flash[:alert] = crawler_error_message(crawl_result)
+
+ old_text = @document.text
+ request = build_request(@document.url, @uri, @document.selector)
+ results = fetch_text(request, @uri, @document)
+
+ @document = results[:document]
+ message = results[:message]
+ crawl_successful = results[:crawl_successful]
+
+ text_changed = old_text != @document.text
+
+ if crawl_successful && text_changed && @document.save
+ flash[:notice] = 'Crawl successful. Document text updated!'
+ elsif crawl_successful && !text_changed
+ flash[:notice] = 'Crawl successful. Document text unchanged.'
else
- flash[:notice] = 'The crawler has updated the document'
+ message ||= 'Crawl failed!'
+ flash.now[:warning] = message
end
+
redirect_to document_path(@document)
end
@@ -168,92 +169,60 @@ def set_document_names
@document_names = DocumentType.where(status: 'approved').order('name ASC')
end
- def set_crawlers
- @crawlers = Rails.env.development? ? DEV_CRAWLERS : PROD_CRAWLERS
- end
-
def document_params
- params.require(:document).permit(:service, :service_id, :user_id, :document_type_id, :name, :url, :xpath, :crawler_server)
+ params.require(:document).permit(:service, :service_id, :user_id, :document_type_id, :name, :url, :selector)
end
- def crawler_error_message(result)
- message = result['message']['name'].to_s
- region = result['message']['crawler'].to_s
- stacktrace = CGI::escapeHTML(result['message']['remoteStacktrace'].to_s)
-
- `It seems that our crawler wasn't able to retrieve any text.
- Reason: #{message}
- Region: #{region}
- Stacktrace: #{stacktrace}`
+ def set_uri
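+ # assumption: OTA_URL holds the crawler's document-fetch endpoint, configured in the app environment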
+ url = ENV['OTA_URL']
+ @uri = URI(url)
end
- # to-do: refactor out comment assembly
- def perform_crawl
- authorize @document
- @tbdoc = TOSBackDoc.new({
- url: @document.url,
- xpath: @document.xpath,
- server: @document.crawler_server
- })
-
- @tbdoc.scrape
- @document_comment = DocumentComment.new
-
- error = @tbdoc.apiresponse['error']
- if error
- message_name = @tbdoc.apiresponse['message']['name'] || ''
- crawler = @tbdoc.apiresponse['message']['crawler'] || ''
- stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
- @document_comment.summary = 'Attempted to Crawl Document
- Error Message: ' + message_name + '
- Crawler: ' + crawler + '
- Stacktrace: ' + stacktrace + ''
- @document_comment.user_id = current_user.id
- @document_comment.document_id = @document.id
- end
+ def build_request(document_url, uri, selector)
+ request = Net::HTTP::Post.new(uri)
+ params = '{"fetch": "' + document_url + '","select": "' + selector + '"}'
+ request.body = params
+ request.content_type = 'application/json'
+ token = ENV['OTA_API_SECRET']
+ request['Authorization'] = "Bearer #{token}"
- document_blank = !@document.text.blank?
- old_length = document_blank ? @document.text.length : 0
- old_crc = document_blank ? Zlib.crc32(@document.text) : 0
- new_crc = Zlib.crc32(@tbdoc.newdata)
- changes_made = old_crc != new_crc
-
- if changes_made
- @document.update(text: @tbdoc.newdata)
- new_length = @document.text ? @document.text.length : 'no text retrieved by crawler'
-
- # There is a cron job in the crontab of the 'tosdr' user on the forum.tosdr.org
- # server which runs once a day and before it deploys the site from edit.tosdr.org
- # to tosdr.org, it will run the check_quotes script from
- # https://github.com/tosdr/tosback-crawler/blob/225a74b/src/eto-admin.js#L121-L123
- # So that if text has moved without changing, points are updated to the corrected
- # quote_start, quote_end, and quote_text values where possible, and/or their status is
- # switched between:
- # pending <-> pending-not-found
- # approved <-> approved-not-found
- crawler = @tbdoc.apiresponse['message']['crawler'] || ''
- @document_comment.summary = 'Document has been crawled
- Old length: ' + old_length.to_s + ' CRC ' + old_crc.to_s + '
- New length: ' + new_length.to_s + ' CRC ' + new_crc.to_s + '
- Crawler: ' + crawler + ''
- @document_comment.user_id = current_user.id
- @document_comment.document_id = @document.id
- end
-
- unless changes_made
- @tbdoc.apiresponse['error'] = true
- @tbdoc.apiresponse['message'] = {
- 'name' => 'The source document has not been updated. No changes made.',
- 'remoteStacktrace' => 'SourceDocument'
- }
- end
+ request
+ end
- message_name = @tbdoc.apiresponse['message']['name'] || ''
- crawler = @tbdoc.apiresponse['message']['crawler'] || ''
- stacktrace = @tbdoc.apiresponse['message']['remoteStacktrace'] || ''
+ def fetch_text(request, uri, document)
+ crawl_successful = false
- @document_comment.summary = 'Attempted to Crawl Document
- Error Message: ' + message_name + '
- Crawler: ' + crawler + '
- Stacktrace: ' + stacktrace + ''
- @document_comment.user_id = current_user.id
- @document_comment.document_id = @document.id
+ begin
+ response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
+ http.request(request)
+ end
- if @document_comment.save
- puts 'Comment added!'
- else
- puts 'Error adding comment!'
- puts @document_comment.errors.full_messages
+ case response
+ when Net::HTTPSuccess
+ Rails.logger.info('HTTP Success')
+ document.text = JSON.parse(response.body)
+ crawl_successful = true
+ message = 'Document created!'
+ else
+ Rails.logger.error("HTTP Error: #{response.code} - #{response.message}")
+ message = "HTTP Error: Could not retrieve document text. Contact team@tosdr.org. Details: #{response.code} - #{response.message}"
+ end
+ rescue SocketError => e
+ # Handle network-related errors
+ Rails.logger.error("Network Error: #{e.message}")
+ message = "Network Error: Crawler unreachable. Could not retrieve document text. Contact team@tosdr.org. Details: #{e.message}"
+ rescue Timeout::Error => e
+ # Handle timeout errors
+ Rails.logger.error("Timeout Error: #{e.message}")
+ message = "Timeout Error: Could not retrieve document text. Contact team@tosdr.org. Details: #{e.message}"
+ rescue StandardError => e
+ # Handle any other standard errors
+ Rails.logger.error("Standard Error: #{e.message}")
+ message = "Standard Error: Could not retrieve document text. Is the crawler running? Contact team@tosdr.org. Details: #{e.message}"
end
- @tbdoc.apiresponse
+ { document: document, message: message, crawl_successful: crawl_successful }
end
end
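
For reference, the request assembled by `build_request` and consumed by `fetch_text` amounts to the sketch below. Only the body keys (`fetch`, `select`), the Bearer auth, and the JSON response handling are taken from the code above; the endpoint and example values are placeholders.

```
# POST the target URL and CSS selector to the endpoint configured in OTA_URL,
# authenticated with OTA_API_SECRET. The response body is expected to be JSON,
# which the controller parses and stores as the document's text.
curl -X POST "$OTA_URL" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $OTA_API_SECRET" \
  -d '{"fetch": "https://www.facebook.com/about/privacy", "select": "body"}'
```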
diff --git a/app/models/document.rb b/app/models/document.rb
index 8119fb93..346dd542 100644
--- a/app/models/document.rb
+++ b/app/models/document.rb
@@ -11,16 +11,18 @@ class Document < ApplicationRecord
has_many :points
has_many :document_comments, dependent: :destroy
+ validates :document_type_id, presence: true
validates :name, presence: true
- validates :url, presence: true
validates :service_id, presence: true
- validates :document_type_id, presence: true
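+ # text is now required, so a document can only be saved once a successful crawl has populated it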
+ validates :text, presence: true
+ validates :url, presence: true
+ validates :selector, presence: true
validate :location_uniqueness_check
validate :document_type_uniqueness_check
def location_uniqueness_check
- doc = Document.where(url: url, xpath: xpath, status: nil)
+ doc = Document.where(url: url, selector: selector, status: nil)
return unless doc.any? && (doc.first.id != id)
@@ -82,7 +84,6 @@ def fetch_ota_text
self.text = document_html
self.ota_sourced = true
self.url = document_ota_url
- self.crawler_server = nil
save
end
diff --git a/app/views/documents/_form.html.erb b/app/views/documents/_form.html.erb
index 376512c6..715791e3 100644
--- a/app/views/documents/_form.html.erb
+++ b/app/views/documents/_form.html.erb
@@ -10,11 +10,9 @@
<%= f.association :document_type, collection: document_names, hint: "Inspect the document types and their descriptions #{link_to 'here', document_types_path}".html_safe %>
- <%= f.input :url, hint: "The web location at which we can fetch the text of this document", placeholder: "e.g. \"https://www.facebook.com/about/privacy\"" %>
+ <%= f.input :url, hint: "The web location where we can fetch the text of this document", placeholder: "e.g. \"https://www.facebook.com/about/privacy\"" %>
- <%= f.input :xpath, input_html: { value: (f.object.xpath.present?) ? f.object.xpath : '//body' }, placeholder: "e.g. \"//*[@id='content']//div[@class='_5tkp _3m9']\"", hint: "The location of the terms on the web page using XPath".html_safe %>
-
- <%= f.input :crawler_server, collection: crawlers, label_method: :second, value_method: :first, hint: "Select which crawler should be used (optional, useful if blocked by EU)".html_safe, selected: document.crawler_server || "eu.crawler.api.tosdr.org" %>
+ <%= f.input :selector, input_html: { value: (f.object.selector.present?) ? f.object.selector : 'body' }, placeholder: "e.g. \".products>a:nth-of-type(2)\"", hint: "The location of the terms on the web page using CSS selectors - CSS selectors cheat sheet".html_safe %>
<%= f.submit "Crawl Document", name: "only_create", class: 'btn btn-default lighter' %>
diff --git a/app/views/documents/edit.html.erb b/app/views/documents/edit.html.erb
index 1695d2f1..78329782 100644
--- a/app/views/documents/edit.html.erb
+++ b/app/views/documents/edit.html.erb
@@ -32,7 +32,7 @@