feat/add-similar-objects #7

Description

@DefJM

Hi @asmaier

I'd like to open a pull request that adds "similar objects" to the parsing section of immoscout.py.
Could you give me the rights to do so? Currently I don't seem to have permission.

Thanks a lot!
Jan
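
For context before the proposed immoscout.py below, here is a minimal sketch of the result shape the new "similar objects" handling assumes. The key names are taken from the spider code; all values are placeholders, and the live payload carries many more fields.

# Hypothetical, heavily trimmed search result entry (all values are placeholders).
# Only the key names referenced by the spider below are real.
result = {
    "resultlist.realEstate": {"@id": "123456", "title": "2-Zimmer-Wohnung"},
    "similarObjects": [
        {
            "similarObject": [
                # assumed to mirror the structure of a regular result entry
                {"resultlist.realEstate": {"@id": "654321", "title": "2-Zimmer-Wohnung"}},
            ]
        }
    ],
}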

# -*- coding: utf-8 -*-
import scrapy
import json
from immospider.items import ImmoscoutItem


class ImmoscoutSpider(scrapy.Spider):
    name = "immoscout"
    allowed_domains = ["immobilienscout24.de"]
    # start_urls = ['https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin']
    # start_urls = ['https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin/Lichterfelde-Steglitz_Nikolassee-Zehlendorf_Dahlem-Zehlendorf_Zehlendorf-Zehlendorf/2,50-/60,00-/EURO--800,00/-/-/']

    # The immoscout search results are stored as json inside their javascript. This makes the parsing very easy.
    # I learned this trick from https://github.com/balzer82/immoscraper/blob/master/immoscraper.ipynb .
    script_xpath = './/script[contains(., "IS24.resultList")]'
    next_xpath = '//div[@id = "pager"]/div/a/@href'

    def start_requests(self):
        # The search URL is expected to be passed in as a spider argument,
        # e.g. scrapy crawl immoscout -a url="...", which Scrapy exposes as self.url.
        yield scrapy.Request(self.url)

    def parse(self, response):

        print(response.url)

        for line in response.xpath(self.script_xpath).extract_first().split('\n'):
            if line.strip().startswith('resultListModel'):
                # Drop the "resultListModel: " prefix (17 characters) and the trailing
                # delimiter to get plain JSON.
                immo_json = line.strip()
                immo_json = json.loads(immo_json[17:-1])

                # On result pages with a single hit, resultlistEntry is a dict instead of
                # a list, so normalize it to a list before iterating.
                entries = immo_json["searchResponseModel"]["resultlist.resultlist"]["resultlistEntries"][0]["resultlistEntry"]
                if isinstance(entries, dict):
                    entries = [entries]
                for result in entries:
                    item = self.parse_result(result, response)
                    yield item

                    # Some entries carry additional matching results under "similarObjects".
                    # Parse them with the same routine, assuming they share the structure
                    # of a regular result entry.
                    if "similarObjects" in result:
                        for similar in result["similarObjects"][0]["similarObject"]:
                            item = self.parse_result(similar, response)
                            yield item

        next_page_list = response.xpath(self.next_xpath).extract()
        if next_page_list:
            next_page = next_page_list[-1]
            print("Scraping next page", next_page)
            if next_page:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)

    def parse_result(self, result, response):
        """Parse a single result entry from the embedded JSON into an ImmoscoutItem.

        :param result: one "resultlistEntry" dict from the search result JSON
        :type result: dict
        :param response: the response the entry was extracted from (used to build absolute URLs)
        :type response: scrapy.http.Response
        """
        item = ImmoscoutItem()
        data = result["resultlist.realEstate"]

        item["immo_id"] = data["@id"]
        item["url"] = response.urljoin("/expose/" + str(data["@id"]))
        item["title"] = data["title"]
        address = data["address"]
        try:
            item["address"] = address["street"] + " " + address["houseNumber"]
        except (KeyError, TypeError):
            item["address"] = None
        item["city"] = address["city"]
        item["zip_code"] = address["postcode"]
        item["district"] = address["quarter"]

        item["rent"] = data["price"]["value"]
        item["sqm"] = data["livingSpace"]
        item["rooms"] = data["numberOfRooms"]

        if "calculatedPrice" in data:
            item["extra_costs"] = (
                data["calculatedPrice"]["value"] - data["price"]["value"]
            )
        if "builtInKitchen" in data:
            item["kitchen"] = data["builtInKitchen"]
        if "balcony" in data:
            item["balcony"] = data["balcony"]
        if "garden" in data:
            item["garden"] = data["garden"]
        if "privateOffer" in data:
            item["private"] = data["privateOffer"]
        if "plotArea" in data:
            item["area"] = data["plotArea"]
        if "cellar" in data:
            item["cellar"] = data["cellar"]

        try:
            contact = data["contactDetails"]
            item["contact_name"] = contact["firstname"] + " " + contact["lastname"]
        except (KeyError, TypeError):
            item["contact_name"] = None

        try:
            item["media_count"] = len(data["galleryAttachments"]["attachment"])
        except (KeyError, TypeError):
            item["media_count"] = 0

        try:
            item["lat"] = address["wgs84Coordinate"]["latitude"]
            item["lng"] = address["wgs84Coordinate"]["longitude"]
        except Exception as e:
            # print(e)
            item["lat"] = None
            item["lng"] = None

        print(item)

        return item
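
ImmoscoutItem lives in immospider/items.py and is not part of this change. For reference, a minimal sketch of a compatible item definition, declaring only the fields that parse_result assigns above (the real items.py may declare more):

import scrapy


class ImmoscoutItem(scrapy.Item):
    # Fields assigned by the spider above.
    immo_id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    address = scrapy.Field()
    city = scrapy.Field()
    zip_code = scrapy.Field()
    district = scrapy.Field()
    rent = scrapy.Field()
    sqm = scrapy.Field()
    rooms = scrapy.Field()
    extra_costs = scrapy.Field()
    kitchen = scrapy.Field()
    balcony = scrapy.Field()
    garden = scrapy.Field()
    private = scrapy.Field()
    area = scrapy.Field()
    cellar = scrapy.Field()
    contact_name = scrapy.Field()
    media_count = scrapy.Field()
    lat = scrapy.Field()
    lng = scrapy.Field()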

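Since start_requests reads the search URL from self.url, the URL has to be supplied as a spider argument when the crawl is started. A minimal sketch of running the spider programmatically; the import path and the example URL (taken from the commented-out start_urls) are assumptions:

from scrapy.crawler import CrawlerProcess

from immospider.spiders.immoscout import ImmoscoutSpider  # assumed module path

process = CrawlerProcess(settings={
    # Write scraped items to a CSV file (the FEEDS setting requires Scrapy >= 2.1).
    "FEEDS": {"apartments.csv": {"format": "csv"}},
})
# Keyword arguments passed to crawl() become spider attributes, so this sets self.url.
process.crawl(
    ImmoscoutSpider,
    url="https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin",
)
process.start()

On the command line, the same run corresponds to scrapy crawl immoscout -a url="..." -o apartments.csv.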