Adds support to ScreenshotsAPI and small improvements

proxycrawl · proxycrawl · commit 715d66288eb2 · 2021-07-07T07:22:31.000+03:00
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -1,6 +1,6 @@
 The MIT License (MIT)
 
-Copyright (c) 2020 ProxyCrawl
+Copyright (c) 2021 ProxyCrawl
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
@@ -174,6 +174,57 @@ end
 
 If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
 
+
+## Screenshots API usage
+
+Initialize with your Screenshots API token and call the `get` method.
+
+```ruby
+screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
+
+begin
+  response = screenshots_api.get('https://www.apple.com')
+  puts response.status_code
+  puts response.screenshot_path # do something with screenshot_path here
+rescue => exception
+  puts exception.backtrace
+end
+```
+
+Or pass a block to work with the image file directly:
+
+```ruby
+screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
+
+begin
+  response = screenshots_api.get('https://www.apple.com') do |file|
+    # do something (reading/writing) with the image file here
+  end
+  puts response.status_code
+rescue => exception
+  puts exception.backtrace
+end
+```
+
+Or specify the file path where the screenshot should be saved:
+
+```ruby
+screenshots_api = ProxyCrawl::ScreenshotsAPI.new(token: 'YOUR_TOKEN')
+
+begin
+  response = screenshots_api.get('https://www.apple.com', save_to_path: '~/screenshot.jpg') do |file|
+    # do something (reading/writing) with the image file here
+  end
+  puts response.status_code
+rescue => exception
+  puts exception.backtrace
+end
+```
+
+Note that the `screenshots_api.get(url, options)` method accepts an [options](https://proxycrawl.com/docs/screenshots-api/parameters) hash of Screenshots API parameters.
+
+If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
diff --git a/lib/proxycrawl.rb b/lib/proxycrawl.rb
@@ -4,6 +4,7 @@
 require 'proxycrawl/api'
 require 'proxycrawl/scraper_api'
 require 'proxycrawl/leads_api'
+require 'proxycrawl/screenshots_api'
 
 module ProxyCrawl
 end
diff --git a/lib/proxycrawl/api.rb b/lib/proxycrawl/api.rb
@@ -70,15 +70,19 @@ def prepare_uri(url, options)
 
     def prepare_response(response, format)
       if format == 'json' || base_url.include?('/scraper')
+        json_body = JSON.parse(response.body) # JSON replies: metadata is read from the parsed body
+        @original_status = json_body['original_status'].to_i
+        @pc_status = json_body['pc_status'].to_i
+        @url = json_body['url']
         @status_code = response.code.to_i
-        @body = response.body
       else
         @original_status = response['original_status'].to_i
         @status_code = response.code.to_i
         @pc_status = response['pc_status'].to_i
         @url = response['url']
-        @body = response.body
       end
+
+      @body = response.body # the raw body is stored unchanged for both branches
     end
   end
 end
diff --git a/lib/proxycrawl/scraper_api.rb b/lib/proxycrawl/scraper_api.rb
@@ -2,13 +2,20 @@
 
 module ProxyCrawl
   class ScraperAPI < ProxyCrawl::API
+    attr_reader :remaining_requests
 
     def post
       raise 'Only GET is allowed for the ScraperAPI'
     end
 
     private
 
+    def prepare_response(response, format)
+      super(response, format)
+      json_body = JSON.parse(response.body)
+      @remaining_requests = json_body['remaining_requests'].to_i
+    end
+
     def base_url
       'https://api.proxycrawl.com/scraper'
     end
diff --git a/lib/proxycrawl/screenshots_api.rb b/lib/proxycrawl/screenshots_api.rb
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+require 'securerandom'
+require 'tmpdir'
+
+module ProxyCrawl
+  class ScreenshotsAPI < ProxyCrawl::API
+    attr_reader :screenshot_path, :success, :remaining_requests, :screenshot_url
+
+    INVALID_SAVE_TO_PATH_FILENAME = 'Filename must end with .jpg or .jpeg'
+    SAVE_TO_PATH_FILENAME_PATTERN = /.+\.(jpg|JPG|jpeg|JPEG)$/.freeze
+
+    def post
+      raise 'Only GET is allowed for the ScreenshotsAPI'
+    end
+
+    def get(url, options = {})
+      screenshot_path = options.delete(:save_to_path) || generate_file_path
+      raise INVALID_SAVE_TO_PATH_FILENAME unless SAVE_TO_PATH_FILENAME_PATTERN =~ screenshot_path
+
+      response = super(url, options)
+      file = File.open(screenshot_path, 'w+')
+      file.write(response.body&.force_encoding('UTF-8'))
+      @screenshot_path = screenshot_path
+      yield(file) if block_given?
+      response
+    ensure
+      file&.close
+    end
+
+    private
+
+    def prepare_response(response, format)
+      super(response, format)
+      @remaining_requests = response['remaining_requests'].to_i
+      @success = response['success'] == 'true'
+      @screenshot_url = response['screenshot_url']
+    end
+
+    def base_url
+      'https://api.proxycrawl.com/screenshots'
+    end
+
+    def generate_file_name
+      "#{SecureRandom.urlsafe_base64}.jpg"
+    end
+
+    def generate_file_path
+      File.join(Dir.tmpdir, generate_file_name)
+    end
+  end
+end
diff --git a/spec/api_spec.rb b/spec/api_spec.rb
@@ -2,11 +2,11 @@
 require 'proxycrawl'
 
 describe ProxyCrawl::API do
-  it "raises an error if token is missing" do
+  it 'raises an error if token is missing' do
     expect { ProxyCrawl::API.new }.to raise_error(RuntimeError, 'Token is required')
   end
 
-  it "sets/reads token" do
+  it 'sets/reads token' do
     expect(ProxyCrawl::API.new(token: 'test').token).to eql('test')
   end
 
@@ -20,7 +20,7 @@
 
       api = ProxyCrawl::API.new(token: 'test')
 
-      response = api.get("http://httpbin.org/anything?param1=x&params2=y")
+      response = api.get('http://httpbin.org/anything?param1=x&params2=y')
 
       expect(response.status_code).to eql(200)
       expect(response.original_status).to eql(200)
@@ -70,4 +70,4 @@
     end
   end
 
-end
+end
diff --git a/spec/screenshots_api_spec.rb b/spec/screenshots_api_spec.rb
@@ -0,0 +1,79 @@
+require 'spec_helper.rb'
+require 'proxycrawl'
+
+describe ProxyCrawl::ScreenshotsAPI do
+  # Shared fixtures: the page being captured, the explicit target file used by
+  # the save_to_path examples, and a client built with a dummy token.
+  let(:target_url) { 'http://httpbin.org/anything?param1=x&params2=y' }
+  let(:save_to_path) { File.join(Dir.tmpdir, 'test-image.jpg') }
+  let(:api) { ProxyCrawl::ScreenshotsAPI.new(token: 'test') }
+
+  it 'raises an error if token is missing' do
+    expect { ProxyCrawl::ScreenshotsAPI.new }.to raise_error(RuntimeError, 'Token is required')
+  end
+
+  it 'sets/reads token' do
+    expect(api.token).to eql('test')
+  end
+
+  describe '#get' do
+    before(:each) do
+      stub_request(:get, 'https://api.proxycrawl.com/screenshots?token=test&url=http%3A%2F%2Fhttpbin.org%2Fanything%3Fparam1%3Dx%26params2%3Dy').
+        to_return(
+          body: 'body',
+          status: 200,
+          headers: { skip_normalize: true, 'original_status' => 200, 'pc_status' => 200, 'url' => target_url })
+    end
+
+    # Remove the file written by the save_to_path examples so runs stay independent.
+    after(:each) do
+      File.delete(save_to_path) if File.exist?(save_to_path)
+    end
+
+    it 'sends a GET request to ProxyCrawl Screenshots API' do
+      response = api.get(target_url)
+
+      expect(response.status_code).to eql(200)
+      expect(response.original_status).to eql(200)
+      expect(response.pc_status).to eql(200)
+      expect(response.url).to eql(target_url)
+      expect(response.body).to eql('body')
+      expect(response.screenshot_path).not_to be_empty
+
+      # The default target is a randomly named temp file; clean it up.
+      File.delete(response.screenshot_path) if File.exist?(response.screenshot_path)
+    end
+
+    it 'accepts a valid save_to_path option' do
+      response = api.get(target_url, save_to_path: save_to_path)
+
+      expect(response.status_code).to eql(200)
+      expect(response.original_status).to eql(200)
+      expect(response.pc_status).to eql(200)
+      expect(response.url).to eql(target_url)
+      expect(response.body).to eql('body')
+      expect(response.screenshot_path).to eql(save_to_path)
+    end
+
+    it 'rejects an invalid save_to_path option' do
+      invalid_paths = [
+        '~/images/image_filename.png',
+        'image_filename.png',
+        '~/images/image_filename',
+        'image_filename'
+      ]
+
+      invalid_paths.each do |invalid_path|
+        expect { api.get(target_url, save_to_path: invalid_path) }.
+          to raise_error(RuntimeError, 'Filename must end with .jpg or .jpeg')
+      end
+    end
+
+    it 'accepts a block' do
+      response = api.get(target_url, save_to_path: save_to_path) do |file|
+        expect(file).to be_kind_of(File)
+        expect(file.path).to eql(save_to_path)
+      end
+
+      expect(response.status_code).to eql(200)
+      expect(response.original_status).to eql(200)
+      expect(response.pc_status).to eql(200)
+      expect(response.url).to eql(target_url)
+      expect(response.body).to eql('body')
+      expect(response.screenshot_path).to eql(save_to_path)
+    end
+  end
+end