Skip to content

Commit bc1116b

Browse files
etewiah and claude
committed
Add PWS ExternalScraperClient integration with fallback to local pasarela
Phase 2 of scraper microservice integration. PWB now attempts extraction via the external Property Web Scraper service first, falling back to local pasarela parsing on unsupported portals or connection errors. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1f8a504 commit bc1116b

File tree

9 files changed

+603
-9
lines changed

9 files changed

+603
-9
lines changed

app/models/pwb/scraped_property.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# connector_used :string
1010
# extracted_data :jsonb
1111
# extracted_images :jsonb
12+
# extraction_source :string
1213
# import_status :string default("pending")
1314
# imported_at :datetime
1415
# raw_html :text
@@ -60,6 +61,7 @@ class ScrapedProperty < ApplicationRecord
6061
IMPORT_STATUSES = %w[pending previewing imported failed].freeze
6162
SCRAPE_METHODS = %w[auto manual_html].freeze
6263
CONNECTORS = %w[http playwright].freeze
64+
EXTRACTION_SOURCES = %w[external local manual].freeze
6365

6466
# Known property portals with specific parsing requirements
6567
KNOWN_PORTALS = {
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# frozen_string_literal: true

module Pwb
  # HTTP client for the Property Web Scraper (PWS) external extraction service.
  # Sends URL + HTML to PWS and receives structured {asset_data, listing_data, images}.
  #
  # Usage:
  #   result = Pwb::ExternalScraperClient.new(url: "https://...", html: "<html>...").call
  #   result.success          # => true
  #   result.extracted_data   # => { "asset_data" => {...}, "listing_data" => {...} }
  #   result.extracted_images # => ["https://..."]
  #
  class ExternalScraperClient
    # Overall request timeout (seconds) used when PWS_TIMEOUT is not set.
    DEFAULT_TIMEOUT = 15

    Result = Struct.new(:success, :extracted_data, :extracted_images, :portal, :extraction_rate, :error, keyword_init: true)

    # Base error class
    class Error < StandardError; end
    # PWS does not support the listing's portal/host.
    class UnsupportedPortalError < Error; end
    # PWS recognised the portal but could not extract data.
    class ExtractionFailedError < Error; end
    # Transport-level failure (timeout, refused connection, TLS, proxy, ...).
    class ConnectionError < Error; end

    # Maps PWS error codes to the exception class to raise.
    ERROR_CODE_MAP = {
      "UNSUPPORTED_HOST" => UnsupportedPortalError,
      "EXTRACTION_FAILED" => ExtractionFailedError
    }.freeze

    attr_reader :url, :html

    # @param url [String] source URL of the listing page
    # @param html [String] raw HTML of the listing page
    def initialize(url:, html:)
      @url = url
      @html = html
    end

    # POST to PWS extraction endpoint.
    #
    # @return [Result]
    # @raise [UnsupportedPortalError, ExtractionFailedError] mapped from PWS error codes
    # @raise [ConnectionError] on any transport-level Faraday failure
    def call
      response = connection.post("/public_api/v1/listings") do |req|
        req.params["format"] = "pwb"
        req.headers["X-Api-Key"] = api_key
        req.headers["Content-Type"] = "application/json"
        req.body = { url: url, html: html }.to_json
      end

      parse_response(response)
    rescue Faraday::TimeoutError => e
      raise ConnectionError, "PWS request timed out: #{e.message}"
    rescue Faraday::ConnectionFailed => e
      raise ConnectionError, "PWS connection failed: #{e.message}"
    rescue Faraday::Error => e
      # Wrap any remaining transport failure (SSL, proxy, ...) so callers only
      # need to rescue ExternalScraperClient::Error subclasses to fall back.
      raise ConnectionError, "PWS request failed: #{e.message}"
    end

    # Check if the external scraper is enabled via ENV
    def self.enabled?
      ENV["PWS_API_URL"].present? && ENV["PWS_ENABLED"] != "false"
    end

    # Check if the PWS service is healthy
    # @return [Boolean]
    def self.healthy?
      return false unless enabled?

      response = build_connection.get("/public_api/v1/health")
      response.status == 200
    rescue Faraday::Error
      false
    end

    # Fetch supported portals from PWS
    # @return [Array<String>] list of portal identifiers
    def self.supported_portals
      return [] unless enabled?

      response = build_connection.get("/public_api/v1/supported_sites") do |req|
        req.headers["X-Api-Key"] = ENV["PWS_API_KEY"]
      end

      body = response.body
      body.is_a?(Hash) ? (body["portals"] || body["sites"] || []) : []
    rescue Faraday::Error
      []
    end

    # NOTE: a bare `private` keyword does not apply to `def self.` singleton
    # methods, so these were never actually private. build_connection must
    # remain public anyway: instances call it with an explicit receiver
    # (`self.class.build_connection`), which private would forbid.
    def self.build_connection
      Faraday.new(url: ENV["PWS_API_URL"]) do |f|
        f.request :json
        f.response :json
        f.adapter Faraday.default_adapter
        f.options.timeout = timeout_seconds
        f.options.open_timeout = 5
      end
    end

    # Request timeout in seconds, overridable via PWS_TIMEOUT.
    def self.timeout_seconds
      (ENV["PWS_TIMEOUT"] || DEFAULT_TIMEOUT).to_i
    end

    private

    # Build a Result from a successful payload, or raise the mapped error.
    # Guards against non-JSON bodies (e.g. an upstream HTML error page),
    # which would otherwise produce confusing failures downstream.
    def parse_response(response)
      body = response.body
      raise Error, "Unexpected PWS response (status #{response.status})" unless body.is_a?(Hash)

      if body["success"]
        data = body["data"] || {}
        Result.new(
          success: true,
          extracted_data: {
            "asset_data" => data["asset_data"],
            "listing_data" => data["listing_data"]
          },
          extracted_images: data["images"] || [],
          portal: body["portal"],
          extraction_rate: body["extraction_rate"]
        )
      else
        error_info = body["error"] || {}
        error_code = error_info["code"]
        error_message = error_info["message"] || "Unknown PWS error"

        error_class = ERROR_CODE_MAP[error_code] || Error
        raise error_class, error_message
      end
    end

    # Per-call Faraday connection (built from ENV each time).
    def connection
      self.class.build_connection
    end

    def api_key
      ENV["PWS_API_KEY"]
    end
  end
end

app/services/pwb/property_scraper_service.rb

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,10 @@ def call
4242
scrape_error_message: nil
4343
)
4444

45-
# Parse and extract data using appropriate pasarela
46-
pasarela = select_pasarela
47-
pasarela.call
45+
# Extract data: try external PWS first, fall back to local pasarela
46+
extract_data
4847

49-
# Reload to get the extracted_data saved by pasarela
48+
# Reload to get the extracted_data saved by extraction
5049
scraped_property.reload
5150
scraped_property.update!(import_status: "previewing")
5251
else
@@ -75,18 +74,52 @@ def import_from_manual_html(html)
7574
scrape_error_message: nil
7675
)
7776

78-
# Parse and extract data
79-
pasarela = select_pasarela
80-
pasarela.call
77+
# Manual HTML always uses local pasarela (skip PWS)
78+
extract_with_local_pasarela
8179

8280
# Reload to get the extracted_data saved by pasarela
8381
scraped_property.reload
84-
scraped_property.update!(import_status: "previewing")
82+
scraped_property.update!(import_status: "previewing", extraction_source: "manual")
8583
scraped_property
8684
end
8785

8886
private
8987

88+
# Attempt extraction via the external PWS service when enabled; on any
# PWS-level failure (unsupported portal, transport error, extraction error)
# log the reason and fall back to the local pasarela parsers.
def extract_data
  unless ExternalScraperClient.enabled?
    extract_with_local_pasarela
    return
  end

  begin
    result = ExternalScraperClient.new(
      url: scraped_property.source_url,
      html: scraped_property.raw_html
    ).call

    scraped_property.update!(
      extracted_data: result.extracted_data,
      extracted_images: result.extracted_images,
      extraction_source: "external"
    )
  rescue ExternalScraperClient::UnsupportedPortalError => e
    Rails.logger.info "[PWS] Unsupported portal, falling back to local: #{e.message}"
    extract_with_local_pasarela
  rescue ExternalScraperClient::ConnectionError => e
    Rails.logger.warn "[PWS] Connection error, falling back to local: #{e.message}"
    extract_with_local_pasarela
  rescue ExternalScraperClient::Error => e
    Rails.logger.warn "[PWS] Extraction error, falling back to local: #{e.message}"
    extract_with_local_pasarela
  end
end
114+
115+
# Run the locally-selected pasarela parser, then record "local" as the
# extraction source unless an earlier step already set one.
def extract_with_local_pasarela
  select_pasarela.call
  scraped_property.reload
  return if scraped_property.extraction_source.present?

  scraped_property.update!(extraction_source: "local")
end
122+
90123
def normalize_url(url_string)
91124
uri = URI.parse(url_string.strip)
92125
"#{uri.host}#{uri.path}".downcase.gsub(%r{/$}, "")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# frozen_string_literal: true

# Adds extraction_source to pwb_scraped_properties, recording which pipeline
# produced the extracted data (e.g. "external", "local", "manual").
class AddExtractionSourceToScrapedProperties < ActiveRecord::Migration[7.1]
  def up
    add_column :pwb_scraped_properties, :extraction_source, :string
  end

  def down
    remove_column :pwb_scraped_properties, :extraction_source
  end
end

db/schema.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
#
1111
# It's strongly recommended that you check this file into your version control system.
1212

13-
ActiveRecord::Schema[8.1].define(version: 2026_02_13_000040) do
13+
ActiveRecord::Schema[8.1].define(version: 2026_02_18_000000) do
1414
# These are extensions that must be enabled in order to support this database
1515
enable_extension "pg_catalog.plpgsql"
1616
enable_extension "pgcrypto"
@@ -1016,6 +1016,7 @@
10161016
t.datetime "created_at", null: false
10171017
t.jsonb "extracted_data", default: {}
10181018
t.jsonb "extracted_images", default: []
1019+
t.string "extraction_source"
10191020
t.string "import_status", default: "pending"
10201021
t.datetime "imported_at"
10211022
t.text "raw_html"

spec/factories/pwb_scraped_properties.rb

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# connector_used :string
1010
# extracted_data :jsonb
1111
# extracted_images :jsonb
12+
# extraction_source :string
1213
# import_status :string default("pending")
1314
# imported_at :datetime
1415
# raw_html :text
@@ -50,6 +51,7 @@
5051
scrape_method { "auto" }
5152
connector_used { "http" }
5253
import_status { "previewing" }
54+
extraction_source { "local" }
5355
raw_html { "<html><head><title>Test Property</title></head><body>Content</body></html>" }
5456
extracted_data do
5557
{
@@ -78,9 +80,39 @@
7880
scrape_method { "manual_html" }
7981
connector_used { nil }
8082
import_status { "previewing" }
83+
extraction_source { "manual" }
8184
raw_html { "<html><head><title>Test Property</title></head><body>Content</body></html>" }
8285
end
8386

87+
# A property whose data was produced by the external PWS service
# (extraction_source "external"): a populated PWB-shaped payload with
# "asset_data" (physical attributes) and "listing_data" (marketing copy
# and price), plus remote image URLs as PWS returns them.
trait :with_external_scrape do
  scrape_successful { true }
  scrape_method { "auto" }
  connector_used { "http" }
  import_status { "previewing" }
  extraction_source { "external" }
  raw_html { "<html><head><title>Test Property</title></head><body>Content</body></html>" }
  extracted_data do
    {
      "asset_data" => {
        "count_bedrooms" => 3,
        "count_bathrooms" => 2,
        "city" => "London",
        "postal_code" => "SW1A 1AA",
        "country" => "UK",
        "prop_type_key" => "apartment",
        "constructed_area" => 120
      },
      "listing_data" => {
        "title" => "Beautiful 3 Bedroom Apartment",
        "description" => "A stunning property in the heart of London.",
        "price_sale_current" => 450_000,
        "currency" => "GBP"
      }
    }
  end
  extracted_images { ["https://example.com/image1.jpg", "https://example.com/image2.jpg"] }
end
115+
84116
trait :with_failed_scrape do
85117
scrape_successful { false }
86118
scrape_error_message { "Request blocked by Cloudflare or bot protection." }

spec/models/pwb/scraped_property_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# connector_used :string
1010
# extracted_data :jsonb
1111
# extracted_images :jsonb
12+
# extraction_source :string
1213
# import_status :string default("pending")
1314
# imported_at :datetime
1415
# raw_html :text

0 commit comments

Comments
 (0)