|
| 1 | +# frozen_string_literal: true |
| 2 | + |
module Pwb
  # HTTP client for the Property Web Scraper (PWS) external extraction service.
  # Sends URL + HTML to PWS and receives structured {asset_data, listing_data, images}.
  #
  # Configuration is read from ENV:
  #   PWS_API_URL - base URL of the PWS service (required for .enabled?)
  #   PWS_API_KEY - value sent in the X-Api-Key header
  #   PWS_ENABLED - set to "false" to switch the integration off
  #   PWS_TIMEOUT - request timeout in seconds (falls back to DEFAULT_TIMEOUT)
  #
  # Usage:
  #   result = Pwb::ExternalScraperClient.new(url: "https://...", html: "<html>...").call
  #   result.success          # => true
  #   result.extracted_data   # => { "asset_data" => {...}, "listing_data" => {...} }
  #   result.extracted_images # => ["https://..."]
  #
  class ExternalScraperClient
    # Request timeout (seconds) used when PWS_TIMEOUT is unset or not a positive integer.
    DEFAULT_TIMEOUT = 15

    # Seconds allowed for establishing the connection.
    OPEN_TIMEOUT = 5

    Result = Struct.new(:success, :extracted_data, :extracted_images, :portal, :extraction_rate, :error, keyword_init: true)

    # Base error class; every error raised on purpose by this client descends from it.
    class Error < StandardError; end
    class UnsupportedPortalError < Error; end
    class ExtractionFailedError < Error; end
    class ConnectionError < Error; end

    # Maps the error codes reported by PWS to the exception class raised for them.
    ERROR_CODE_MAP = {
      "UNSUPPORTED_HOST" => UnsupportedPortalError,
      "EXTRACTION_FAILED" => ExtractionFailedError
    }.freeze

    attr_reader :url, :html

    # @param url [String] address of the listing page
    # @param html [String] HTML of that page, forwarded to PWS as-is
    def initialize(url:, html:)
      @url = url
      @html = html
    end

    # POST to PWS extraction endpoint
    #
    # @return [Result]
    # @raise [ConnectionError] when the request times out or the connection fails
    # @raise [UnsupportedPortalError, ExtractionFailedError, Error] when PWS reports
    #   a failure or answers with a body that is not a JSON object
    def call
      response = connection.post("/public_api/v1/listings") do |req|
        req.params["format"] = "pwb"
        req.headers["X-Api-Key"] = api_key
        req.headers["Content-Type"] = "application/json"
        req.body = { url: url, html: html }.to_json
      end

      parse_response(response)
    rescue Faraday::TimeoutError => e
      raise ConnectionError, "PWS request timed out: #{e.message}"
    rescue Faraday::ConnectionFailed => e
      raise ConnectionError, "PWS connection failed: #{e.message}"
    end

    # Check if the external scraper is enabled via ENV
    #
    # @return [Boolean] true when PWS_API_URL is present and PWS_ENABLED is not "false"
    def self.enabled?
      ENV["PWS_API_URL"].present? && ENV["PWS_ENABLED"] != "false"
    end

    # Check if the PWS service is healthy
    #
    # @return [Boolean] true only when the health endpoint answers with HTTP 200;
    #   false when the client is disabled or any Faraday error occurs
    def self.healthy?
      return false unless enabled?

      response = build_connection.get("/public_api/v1/health")
      response.status == 200
    rescue Faraday::Error
      false
    end

    # Fetch supported portals from PWS
    #
    # @return [Array<String>] list of portal identifiers; empty when the client is
    #   disabled, the request fails, or the body has neither "portals" nor "sites"
    def self.supported_portals
      return [] unless enabled?

      response = build_connection.get("/public_api/v1/supported_sites") do |req|
        req.headers["X-Api-Key"] = ENV["PWS_API_KEY"]
      end

      body = response.body
      body.is_a?(Hash) ? (body["portals"] || body["sites"] || []) : []
    rescue Faraday::Error
      []
    end

    # Build a Faraday connection to PWS_API_URL with JSON request/response
    # middleware and the configured timeouts.
    #
    # Deliberately public: a bare `private` does not apply to `def self.` methods,
    # and the instance-level #connection calls this with an explicit receiver.
    #
    # @return [Faraday::Connection]
    def self.build_connection
      Faraday.new(url: ENV["PWS_API_URL"]) do |f|
        f.request :json
        f.response :json
        f.adapter Faraday.default_adapter
        f.options.timeout = timeout_seconds
        f.options.open_timeout = OPEN_TIMEOUT
      end
    end

    # Request timeout in seconds, taken from PWS_TIMEOUT.
    #
    # An unset, blank, non-numeric, zero or negative value falls back to
    # DEFAULT_TIMEOUT rather than configuring a timeout of 0 seconds or less.
    #
    # @return [Integer]
    def self.timeout_seconds
      seconds = ENV["PWS_TIMEOUT"].to_i # nil.to_i and "abc".to_i are both 0
      seconds.positive? ? seconds : DEFAULT_TIMEOUT
    end

    private

    # Turn a PWS HTTP response into a Result, or raise the matching error.
    #
    # @param response [#body, #status] response whose body is expected to be the parsed JSON object
    # @return [Result]
    # @raise [Error] or a subclass from ERROR_CODE_MAP
    def parse_response(response)
      body = response.body

      # The body is only usable when it is a parsed JSON object. Anything else
      # (nil, a raw String such as an error page, an Array) is reported as a
      # client error instead of being indexed blindly.
      unless body.is_a?(Hash)
        raise Error, "Unexpected PWS response (HTTP #{response.status}): body is not a JSON object"
      end

      return build_result(body) if body["success"]

      raise_pws_error(body["error"])
    end

    # Build the success Result from a parsed PWS body.
    def build_result(body)
      data = body["data"]
      data = {} unless data.is_a?(Hash) # tolerate a missing or malformed "data" entry

      Result.new(
        success: true,
        extracted_data: {
          "asset_data" => data["asset_data"],
          "listing_data" => data["listing_data"]
        },
        extracted_images: data["images"] || [],
        portal: body["portal"],
        extraction_rate: body["extraction_rate"]
      )
    end

    # Raise the exception that corresponds to the "error" entry of a PWS body.
    # Accepts the structured form ({"code" => ..., "message" => ...}) and also a
    # plain String, which is used as the message.
    def raise_pws_error(error_info)
      code = nil
      message = nil

      case error_info
      when Hash
        code = error_info["code"]
        message = error_info["message"]
      when String
        message = error_info unless error_info.empty?
      end

      raise ERROR_CODE_MAP.fetch(code, Error), (message || "Unknown PWS error")
    end

    # Fresh connection for this request.
    def connection
      self.class.build_connection
    end

    # API key sent in the X-Api-Key header.
    def api_key
      ENV["PWS_API_KEY"]
    end
  end
end
0 commit comments