diff --git a/BATDONGSAN_CRAWLER_README.md b/BATDONGSAN_CRAWLER_README.md
new file mode 100644
index 000000000..cd894990a
--- /dev/null
+++ b/BATDONGSAN_CRAWLER_README.md
@@ -0,0 +1,235 @@
+# BatDongSan.com.vn Crawler
+
+A script for crawling real-estate data from batdongsan.com.vn with Crawl4AI.
+
+## Features
+
+- ✅ Crawl lists of property listings
+- ✅ Extract detailed information (title, price, area, address, number of bedrooms, etc.)
+- ✅ Pagination support (crawl multiple pages)
+- ✅ Export data to JSON files
+- ✅ Handle dynamic (JavaScript-rendered) content
+- ✅ Flexible configuration
+
+## Installation
+
+1. Make sure Crawl4AI is installed:
+```bash
+pip install crawl4ai
+```
+
+2. Install the required dependencies:
+```bash
+crawl4ai-setup
+```
+
+## Usage
+
+### Option 1: Run the sample script
+
+Run the script directly with the built-in example:
+
+```bash
+python crawl_batdongsan.py
+```
+
+The script crawls apartments for sale in Ho Chi Minh City and saves the results to the `crawled_data/` directory.
+
+### Option 2: Use it in your own code
+
+```python
+import asyncio
+from crawl_batdongsan import BatDongSanCrawler
+
+async def main():
+    # Initialize the crawler
+    crawler = BatDongSanCrawler(headless=True)
+
+    # URL to crawl
+    url = "https://batdongsan.com.vn/ban-can-ho-chung-cu-tp-hcm"
+
+    # Crawl 5 pages
+    properties = await crawler.crawl_multiple_pages(url, num_pages=5)
+
+    # Save the results
+    crawler.save_to_json(properties, "my_properties.json")
+
+    # Print statistics
+    print(f"Crawled {len(properties)} properties")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### Option 3: Crawl custom URLs
+
+```python
+import asyncio
+from crawl_batdongsan import BatDongSanCrawler
+
+async def crawl_custom():
+    crawler = BatDongSanCrawler(headless=True)
+
+    # Change the URLs to suit your needs
+    urls = [
+        "https://batdongsan.com.vn/ban-nha-rieng-ha-noi",
+        "https://batdongsan.com.vn/cho-thue-van-phong-tp-hcm",
+        "https://batdongsan.com.vn/ban-dat-nen-binh-duong"
+    ]
+
+    all_data = []
+    for url in urls:
+        properties = await crawler.crawl_multiple_pages(url, num_pages=3)
+        all_data.extend(properties)
+
+    crawler.save_to_json(all_data, "all_properties.json")
+
+asyncio.run(crawl_custom())
+```
+
+## Data structure
+
+Each extracted property has the following fields:
+
+```json
+{
+  "title": "Listing title",
+  "link": "Detail page URL",
+  "price": "Sale or rental price",
+  "area": "Area",
+  "location": "Address",
+  "bedrooms": "Number of bedrooms",
+  "toilets": "Number of toilets",
+  "description": "Short description",
+  "image": "Image URL",
+  "publish_date": "Publish date"
+}
+```
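+
+To take a quick look at a saved file, the standard `json` module is enough. This is just a sketch; `ban_can_ho_hcm.json` is the filename used by the first example and may differ in your run:
+
+```python
+import json
+
+# Load a file produced by crawler.save_to_json()
+with open("crawled_data/ban_can_ho_hcm.json", encoding="utf-8") as f:
+    properties = json.load(f)
+
+# Print a few key fields of the first listings
+for prop in properties[:5]:
+    print(prop.get("title"), "|", prop.get("price"), "|", prop.get("area"))
+```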
+
+## Customizing CSS selectors
+
+If the website structure changes, you can update the CSS selectors in the `get_listing_schema()` method:
+
+```python
+def get_listing_schema(self):
+    schema = {
+        "name": "BatDongSan Property Listings",
+        "baseSelector": "div.re__card-full",  # Main container
+        "fields": [
+            {
+                "name": "title",
+                "selector": "a.pr-title",  # Update this selector
+                "type": "text",
+            },
+            # Add or edit other fields...
+        ]
+    }
+    return schema
+```
+
+## Examples in the script
+
+The script ships with three examples:
+
+1. **example_crawl_ban_can_ho()**: crawls apartments for sale in Ho Chi Minh City
+2. **example_crawl_cho_thue_nha()**: crawls houses for rent in Hanoi
+3. **example_custom_url()**: crawls a custom URL and prints statistics
+
+To run the other examples, uncomment them in `main()`, as sketched below.
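+
+For instance, a `main()` that runs the rental example instead of the default one could look like this (both function names come from `crawl_batdongsan.py`):
+
+```python
+async def main():
+    # Default example (apartments for sale in HCMC) disabled
+    # await example_crawl_ban_can_ho()
+
+    # Run the house-rental example instead
+    await example_crawl_cho_thue_nha()
+```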
+
+## Important notes
+
+⚠️ **Follow robots.txt and the Terms of Service**
+- Check batdongsan.com.vn's `robots.txt`
+- Respect crawl-rate limits
+- Use the data legally and ethically
+
+⚠️ **CSS selectors may change**
+- The website may update its HTML structure
+- If no data is extracted, check and update the selectors
+- Use the browser's DevTools to inspect the elements
+
+⚠️ **Performance**
+- The script delays between requests to avoid overloading the server
+- Adjust `delay_before_return_html` and `asyncio.sleep()` if needed
+
+## Troubleshooting
+
+### No data is extracted
+
+1. Check that the URL is correct
+2. Run with `headless=False` to watch the browser
+3. Verify that the CSS selectors are still valid
+4. Increase `delay_before_return_html` if the page loads slowly
+
+### Timeout errors
+
+```python
+crawler_config = CrawlerRunConfig(
+    # ... other config ...
+    page_timeout=60000,  # Raise the timeout to 60 seconds
+)
+```
+
+### Crawling is slow
+
+- Crawl fewer pages
+- Use caching
+- Crawl several URLs in parallel (see the sketch below)
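+
+A minimal sketch of parallel crawling with `asyncio.gather`, assuming the `BatDongSanCrawler` class from `crawl_batdongsan.py` (the URLs are the sample categories used elsewhere in this README):
+
+```python
+import asyncio
+from crawl_batdongsan import BatDongSanCrawler
+
+async def crawl_parallel():
+    crawler = BatDongSanCrawler(headless=True)
+    urls = [
+        "https://batdongsan.com.vn/ban-can-ho-chung-cu-tp-hcm",
+        "https://batdongsan.com.vn/cho-thue-nha-rieng-ha-noi",
+    ]
+    # Each page crawl opens its own AsyncWebCrawler, so the calls can run concurrently
+    results = await asyncio.gather(
+        *(crawler.crawl_multiple_pages(url, num_pages=2) for url in urls)
+    )
+    all_properties = [prop for page in results for prop in page]
+    crawler.save_to_json(all_properties, "parallel_properties.json")
+
+asyncio.run(crawl_parallel())
+```
+
+Keep the number of concurrent crawls small: parallelism multiplies the request rate hitting the site.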
+
+## Extending
+
+### Crawl the detail page of each property
+
+Add a method like the following to `BatDongSanCrawler` (the detail-page selectors are illustrative and may need updating):
+
+```python
+async def crawl_property_detail(self, url: str):
+    """Crawl the detail page of a single property"""
+    # Define the schema for the detail page
+    detail_schema = {
+        "baseSelector": "div.re__pr-specs-content",
+        "fields": [
+            {"name": "full_description", "selector": ".re__detail-content", "type": "text"},
+            {"name": "contact_name", "selector": ".re__contact-name", "type": "text"},
+            # ... add more fields
+        ]
+    }
+    # Crawl the page with the same pattern as crawl_listing_page()
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        extraction_strategy=JsonCssExtractionStrategy(detail_schema),
+        delay_before_return_html=2
+    )
+    async with AsyncWebCrawler(config=self.browser_config) as crawler:
+        result = await crawler.arun(url=url, config=config)
+    if result.success and result.extracted_content:
+        return json.loads(result.extracted_content)
+    return []
+```
+
+### Save to a database
+
+```python
+import sqlite3
+
+def save_to_database(properties):
+    conn = sqlite3.connect('batdongsan.db')
+    cursor = conn.cursor()
+
+    cursor.execute('''
+        CREATE TABLE IF NOT EXISTS properties (
+            id INTEGER PRIMARY KEY,
+            title TEXT,
+            price TEXT,
+            area TEXT,
+            location TEXT,
+            link TEXT UNIQUE
+        )
+    ''')
+
+    for prop in properties:
+        # .get() avoids a KeyError when a field was not extracted for a listing
+        cursor.execute('''
+            INSERT OR IGNORE INTO properties (title, price, area, location, link)
+            VALUES (?, ?, ?, ?, ?)
+        ''', (prop.get('title'), prop.get('price'), prop.get('area'), prop.get('location'), prop.get('link')))
+
+    conn.commit()
+    conn.close()
+```
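+
+Hypothetical usage, assuming `properties` is the list returned by `crawler.crawl_multiple_pages()`:
+
+```python
+save_to_database(properties)
+
+# Quick sanity check of what was stored
+import sqlite3
+conn = sqlite3.connect('batdongsan.db')
+print(conn.execute("SELECT COUNT(*) FROM properties").fetchone()[0], "rows stored")
+conn.close()
+```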
+
+## License
+
+MIT License - free to use, at your own responsibility.
+
+## Contributing
+
+All contributions and improvements are welcome!
diff --git a/crawl_batdongsan.py b/crawl_batdongsan.py
new file mode 100644
index 000000000..b7fa1cd1a
--- /dev/null
+++ b/crawl_batdongsan.py
@@ -0,0 +1,312 @@
+"""
+Crawler for batdongsan.com.vn - Vietnamese Real Estate Website
+This script crawls property listings from batdongsan.com.vn and extracts structured data.
+
+Usage:
+    python crawl_batdongsan.py
+
+Features:
+- Crawl property listings with detailed information
+- Support for pagination
+- Export data to JSON format
+- Handle dynamic content loading
+"""
+
+import asyncio
+import json
+import os
+from datetime import datetime
+from typing import List, Dict
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+
+class BatDongSanCrawler:
+    """Crawler for batdongsan.com.vn"""
+
+    def __init__(self, headless: bool = True):
+        """
+        Initialize the crawler
+
+        Args:
+            headless: Run browser in headless mode (default: True)
+        """
+        self.browser_config = BrowserConfig(
+            headless=headless,
+            verbose=True,
+            java_script_enabled=True
+        )
+
+    def get_listing_schema(self):
+        """
+        Define CSS extraction schema for property listings
+
+        Returns:
+            dict: CSS extraction schema
+        """
+        schema = {
+            "name": "BatDongSan Property Listings",
+            "baseSelector": "div.re__card-full",  # Main container for each listing
+            "fields": [
+                {
+                    "name": "title",
+                    "selector": "a.pr-title",
+                    "type": "text",
+                },
+                {
+                    "name": "link",
+                    "selector": "a.pr-title",
+                    "type": "attribute",
+                    "attribute": "href"
+                },
+                {
+                    "name": "price",
+                    "selector": "span.re__card-config-price",
+                    "type": "text",
+                },
+                {
+                    "name": "area",
+                    "selector": "span.re__card-config-area",
+                    "type": "text",
+                },
+                {
+                    "name": "location",
+                    "selector": "div.re__card-location",
+                    "type": "text",
+                },
+                {
+                    "name": "bedrooms",
+                    "selector": "span.re__card-config-bedroom",
+                    "type": "text",
+                },
+                {
+                    "name": "toilets",
+                    "selector": "span.re__card-config-toilet",
+                    "type": "text",
+                },
+                {
+                    "name": "description",
+                    "selector": "div.re__card-description",
+                    "type": "text",
+                },
+                {
+                    "name": "image",
+                    "selector": "img.re__card-image",
+                    "type": "attribute",
+                    "attribute": "src"
+                },
+                {
+                    "name": "publish_date",
+                    "selector": "span.re__card-published-info-date",
+                    "type": "text",
+                }
+            ]
+        }
+        return schema
+
+    async def crawl_listing_page(self, url: str, session_id: str = None) -> List[Dict]:
+        """
+        Crawl a single listing page
+
+        Args:
+            url: URL of the listing page
+            session_id: Optional session ID to maintain state across requests
+
+        Returns:
+            List of property dictionaries
+        """
+        extraction_strategy = JsonCssExtractionStrategy(self.get_listing_schema())
+
+        crawler_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            extraction_strategy=extraction_strategy,
+            session_id=session_id,
+            wait_until="networkidle",  # Wait for network to be idle
+            delay_before_return_html=2  # Wait 2 seconds for dynamic content
+        )
+
+        async with AsyncWebCrawler(config=self.browser_config) as crawler:
+            result = await crawler.arun(url=url, config=crawler_config)
+
+            if result.success and result.extracted_content:
+                try:
+                    properties = json.loads(result.extracted_content)
+                    return properties
+                except json.JSONDecodeError as e:
+                    print(f"Error parsing JSON: {e}")
+                    return []
+            else:
+                print(f"Failed to crawl {url}")
+                return []
+
+    async def crawl_multiple_pages(self, base_url: str, num_pages: int = 3) -> List[Dict]:
+        """
+        Crawl multiple pages of listings
+
+        Args:
+            base_url: Base URL for the search/category
+            num_pages: Number of pages to crawl (default: 3)
+
+        Returns:
+            List of all properties from all pages
+        """
+        all_properties = []
+
+        for page_num in range(1, num_pages + 1):
+            # Construct URL for pagination
+            # batdongsan.com.vn typically uses /p{page_num} for pagination
+            if page_num == 1:
+                url = base_url
+            else:
+                # Add pagination parameter (adjust based on actual URL structure)
+                url = f"{base_url}/p{page_num}"
+
+            print(f"\n{'='*60}")
+            print(f"Crawling page {page_num}: {url}")
+            print(f"{'='*60}")
+
+            properties = await self.crawl_listing_page(url)
+
+            if properties:
+                all_properties.extend(properties)
+                print(f"Found {len(properties)} properties on page {page_num}")
+            else:
+                print(f"No properties found on page {page_num}")
+                # If no results, might have reached the last page
+                if page_num > 1:
+                    break
+
+            # Add delay between pages to be respectful
+            if page_num < num_pages:
+                await asyncio.sleep(2)
+
+        return all_properties
+
+    def save_to_json(self, data: List[Dict], filename: str = None):
+        """
+        Save crawled data to JSON file
+
+        Args:
+            data: List of property dictionaries
+            filename: Output filename (default: batdongsan_YYYYMMDD_HHMMSS.json)
+        """
+        if filename is None:
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"batdongsan_{timestamp}.json"
+
+        output_dir = "crawled_data"
+        os.makedirs(output_dir, exist_ok=True)
+        filepath = os.path.join(output_dir, filename)
+
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(data, f, ensure_ascii=False, indent=2)
+
+        print(f"\n{'='*60}")
+        print(f"Data saved to: {filepath}")
+        print(f"Total properties: {len(data)}")
+        print(f"{'='*60}")
+
+        return filepath
+
+
+async def example_crawl_ban_can_ho():
+    """
+    Example: Crawl apartment listings for sale
+    """
+    print("Example: Crawling apartment listings for sale in Ho Chi Minh City")
+    print("="*60)
+
+    crawler = BatDongSanCrawler(headless=True)
+
+    # Example URL for apartments for sale in HCMC
+    # Note: Update this URL based on actual batdongsan.com.vn structure
+    url = "https://batdongsan.com.vn/ban-can-ho-chung-cu-tp-hcm"
+
+    # Crawl 3 pages
+    properties = await crawler.crawl_multiple_pages(url, num_pages=3)
+
+    # Save to JSON
+    if properties:
+        filepath = crawler.save_to_json(properties, "ban_can_ho_hcm.json")
+
+        # Print sample data
+        print("\nSample property:")
+        print(json.dumps(properties[0], ensure_ascii=False, indent=2))
+    else:
+        print("No properties found")
+
+
+async def example_crawl_cho_thue_nha():
+    """
+    Example: Crawl house rental listings
+    """
+    print("\nExample: Crawling house rental listings in Hanoi")
+    print("="*60)
+
+    crawler = BatDongSanCrawler(headless=True)
+
+    # Example URL for house rentals in Hanoi
+    url = "https://batdongsan.com.vn/cho-thue-nha-rieng-ha-noi"
+
+    # Crawl 2 pages
+    properties = await crawler.crawl_multiple_pages(url, num_pages=2)
+
+    # Save to JSON
+    if properties:
+        filepath = crawler.save_to_json(properties, "cho_thue_nha_hanoi.json")
+    else:
+        print("No properties found")
+
+
+async def example_custom_url():
+    """
+    Example: Crawl from a custom URL
+    You can modify this to use any search URL from batdongsan.com.vn
+    """
+    print("\nExample: Custom URL crawling")
+    print("="*60)
+
+    crawler = BatDongSanCrawler(headless=True)
+
+    # Replace this with your desired URL
+    custom_url = "https://batdongsan.com.vn/ban-nha-rieng-tp-hcm"
+
+    properties = await crawler.crawl_multiple_pages(custom_url, num_pages=2)
+
+    if properties:
+        crawler.save_to_json(properties)
+
+        # Print statistics
+        print(f"\nCrawling Statistics:")
+        print(f"Total properties: {len(properties)}")
+
+        # Count properties by location if available
+        locations = {}
+        for prop in properties:
+            location = prop.get('location', 'Unknown')
+            locations[location] = locations.get(location, 0) + 1
+
+        print("\nTop 5 locations:")
+        sorted_locations = sorted(locations.items(), key=lambda x: x[1], reverse=True)[:5]
+        for location, count in sorted_locations:
+            print(f"  {location}: {count} properties")
+
+
+async def main():
+    """
+    Main function with multiple examples
+    """
+    print("="*60)
+    print("BatDongSan.com.vn Crawler")
+    print("="*60)
+
+    # Run example - crawl apartments for sale
+    await example_crawl_ban_can_ho()
+
+    # Uncomment to run other examples:
+    # await example_crawl_cho_thue_nha()
+    # await example_custom_url()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/crawl_batdongsan_simple.py b/crawl_batdongsan_simple.py
new file mode 100644
index 000000000..561ec46f5
--- /dev/null
+++ b/crawl_batdongsan_simple.py
@@ -0,0 +1,224 @@
+"""
+Simple example script to crawl batdongsan.com.vn
+Quickly modify the URL and run to get property data
+
+Usage:
+    python crawl_batdongsan_simple.py
+"""
+
+import asyncio
+import json
+import os
+from datetime import datetime
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+
+async def crawl_batdongsan(url: str, num_pages: int = 1):
+    """
+    Crawl batdongsan.com.vn and extract property listings
+
+    Args:
+        url: The URL to crawl (e.g., search page or category page)
+        num_pages: Number of pages to crawl
+
+    Returns:
+        List of property dictionaries
+    """
+    # Configure browser
+    browser_config = BrowserConfig(
+        headless=True,  # Set to False to see the browser
+        verbose=True,
+        java_script_enabled=True
+    )
+
+    # Define what data to extract using CSS selectors
+    # Note: You may need to update these selectors if the website structure changes
+    extraction_schema = {
+        "name": "BatDongSan Properties",
+        "baseSelector": "div.re__card-full",  # Main container for each property
+        "fields": [
+            {
+                "name": "title",
+                "selector": "a.pr-title",
+                "type": "text",
+            },
+            {
+                "name": "link",
+                "selector": "a.pr-title",
+                "type": "attribute",
+                "attribute": "href"
+            },
+            {
+                "name": "price",
+                "selector": "span.re__card-config-price",
+                "type": "text",
+            },
+            {
+                "name": "area",
+                "selector": "span.re__card-config-area",
+                "type": "text",
+            },
+            {
+                "name": "location",
+                "selector": "div.re__card-location",
+                "type": "text",
+            },
+            {
+                "name": "bedrooms",
+                "selector": "span.re__card-config-bedroom",
+                "type": "text",
+            },
+            {
+                "name": "description",
+                "selector": "div.re__card-description",
+                "type": "text",
+            },
+            {
+                "name": "image",
+                "selector": "img.re__card-image",
+                "type": "attribute",
+                "attribute": "src"
+            }
+        ]
+    }
+
+    all_properties = []
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        for page_num in range(1, num_pages + 1):
+            # Build URL for each page
+            if page_num == 1:
+                page_url = url
+            else:
+                page_url = f"{url}/p{page_num}"
+
+            print(f"\nCrawling page {page_num}: {page_url}")
+
+            # Configure crawler
+            crawler_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                extraction_strategy=JsonCssExtractionStrategy(extraction_schema),
+                wait_until="networkidle",
+                delay_before_return_html=2  # Wait for content to load
+            )
+
+            # Crawl the page
+            result = await crawler.arun(url=page_url, config=crawler_config)
+
+            if result.success and result.extracted_content:
+                try:
+                    properties = json.loads(result.extracted_content)
+                    all_properties.extend(properties)
+                    print(f"✓ Found {len(properties)} properties on page {page_num}")
+                except json.JSONDecodeError:
+                    print(f"✗ Error parsing data from page {page_num}")
+            else:
+                print(f"✗ Failed to crawl page {page_num}")
+
+            # Delay between pages
+            if page_num < num_pages:
+                await asyncio.sleep(2)
+
+    return all_properties
+
+
+def save_results(properties, filename=None):
+    """Save properties to JSON file"""
+    if not properties:
+        print("\nNo properties to save!")
+        return
+
+    # Create output directory
+    output_dir = "crawled_data"
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Generate filename if not provided
+    if filename is None:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"batdongsan_{timestamp}.json"
+
+    filepath = os.path.join(output_dir, filename)
+
+    # Save to file
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(properties, f, ensure_ascii=False, indent=2)
+
+    print(f"\n{'='*60}")
+    print(f"✓ Saved {len(properties)} properties to: {filepath}")
+    print(f"{'='*60}")
+
+    return filepath
+
+
+async def main():
+    """Main function"""
+    print("="*60)
+    print("BatDongSan.com.vn Crawler - Simple Version")
+    print("="*60)
+
+    # ========================================
+    # CUSTOMIZE THESE SETTINGS
+    # ========================================
+
+    # URL to crawl - Change this to your desired category/search URL
+    # Examples:
+    # - Apartments for sale in HCMC: https://batdongsan.com.vn/ban-can-ho-chung-cu-tp-hcm
+    # - Houses for rent in Hanoi: https://batdongsan.com.vn/cho-thue-nha-rieng-ha-noi
+    # - Land for sale in Binh Duong: https://batdongsan.com.vn/ban-dat-nen-binh-duong
+
+    URL = "https://batdongsan.com.vn/ban-can-ho-chung-cu-tp-hcm"
+
+    # Number of pages to crawl
+    NUM_PAGES = 3
+
+    # Output filename (optional)
+    OUTPUT_FILE = "properties.json"
+
+    # ========================================
+
+    print(f"\nURL: {URL}")
+    print(f"Pages to crawl: {NUM_PAGES}")
+
+    # Crawl the website
+    properties = await crawl_batdongsan(URL, num_pages=NUM_PAGES)
+
+    # Save results
+    if properties:
+        save_results(properties, OUTPUT_FILE)
+
+        # Print sample property
+        print("\nSample property:")
+        print(json.dumps(properties[0], ensure_ascii=False, indent=2))
+
+        # Print statistics
+        print(f"\nTotal properties crawled: {len(properties)}")
+
+        # Show price range if available
+        prices = [p.get('price', '') for p in properties if p.get('price')]
+        if prices:
+            print(f"Found {len(prices)} properties with prices")
+
+        # Show locations
+        locations = {}
+        for prop in properties:
+            loc = prop.get('location', 'Unknown')
+            if loc:
+                locations[loc] = locations.get(loc, 0) + 1
+
+        if locations:
+            print("\nTop 5 locations:")
+            sorted_locs = sorted(locations.items(), key=lambda x: x[1], reverse=True)[:5]
+            for loc, count in sorted_locs:
+                print(f"  - {loc}: {count} properties")
+
+    else:
+        print("\n✗ No properties found. This might mean:")
+        print("  1. The URL is incorrect")
+        print("  2. The CSS selectors need to be updated")
+        print("  3. The website structure has changed")
+        print("\nTry running with headless=False to see what's happening")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())