Robotslxml.txt #5108

Wichitrawinu · 2026-02-10T16:21:11Z

Wichitrawinu
Feb 10, 2026

pip install -r requirements.txt
python main_bot.py
my_crawler/
├── main_bot.py # โค้ดหลักที่เราเขียนกัน
├── crawler_cache.db # ไฟล์ฐานข้อมูล (จะเกิดขึ้นเอง)
├── requirements.txt # ไฟล์รายการ Library
└── README.md # คู่มือการใช้งาน
import smtplib
import schedule
import time
from email.message import EmailMessage

class AutomatedCrawler(UltimateCrawler):
    """Crawler that can e-mail its report file after a run.

    ``email_config`` is a mapping that provides the ``sender_email``,
    ``password`` and ``receiver_email`` keys read by :meth:`send_report`.
    """

    def __init__(self, email_config):
        # The pasted snippet spelled this ``init`` (the double underscores were
        # lost), so Python never invoked it as the constructor.
        super().__init__()
        self.email_config = email_config

    def send_report(self, filename):
        """Send the report file as an e-mail attachment.

        Args:
            filename: Path of the report file to attach.

        Returns:
            None. Success or failure is reported on stdout. A missing or
            unreadable file, or an SMTP failure, is printed rather than
            raised, so a scheduled job keeps running.
        """
        import os  # local import: only needed to derive the attachment name

        msg = EmailMessage()
        msg['Subject'] = f"🚀 Crawler Report: {time.strftime('%Y-%m-%d')}"
        msg['From'] = self.email_config['sender_email']
        msg['To'] = self.email_config['receiver_email']
        msg.set_content(f"บอททำงานเสร็จสิ้นแล้ว! พบข้อมูลใหม่และบันทึกลงใน {filename}")

        # Attach the result file. Reading it used to happen outside any
        # ``try``, so a missing report raised straight out of the caller.
        try:
            with open(filename, 'rb') as f:
                file_data = f.read()
        except OSError as e:
            print(f"❌ ส่งอีเมลไม่สำเร็จ: {e}")
            return

        msg.add_attachment(
            file_data,
            maintype='application',
            subtype='octet-stream',
            # Use the bare file name so directory parts of the path are not
            # exposed in the attachment header.
            filename=os.path.basename(filename),
        )

        try:
            with smtplib.SMTP_SSL('smtp.gmail.com', 465) as smtp:
                smtp.login(self.email_config['sender_email'], self.email_config['password'])
                smtp.send_message(msg)
            print("📧 ส่งอีเมลรายงานเรียบร้อย!")
        except Exception as e:
            print(f"❌ ส่งอีเมลไม่สำเร็จ: {e}")

# --- การตั้งค่าและเริ่มการทำงาน ---

# E-mail settings: sender account, its password, and the report recipient.
CONFIG = dict(
    sender_email='your_email@gmail.com',
    password='your_app_password',  # use a Google App Password
    receiver_email='target_email@gmail.com',
)

def job():
    """Run one scheduled cycle: build the crawler and e-mail the report file."""
    print(f"⏰ เริ่มรอบการทำงาน ณ เวลา {time.ctime()}")
    bot = AutomatedCrawler(CONFIG)

    # Example workflow:
    # 1. Load the sitemap and scrape the data.
    # 2. Save the results as a CSV file.
    filename = "daily_report.csv"
    # bot.save_to_csv(data_list, filename)

    # 3. E-mail the report.
    bot.send_report(filename)

# --- ตั้งเวลา (Scheduler) ---

# Run the job every day at 08:30.
schedule.every().day.at("08:30").do(job)

# Alternatively, run it every hour. Enable only ONE of the two schedules,
# otherwise the job is registered twice.
# schedule.every(1).hours.do(job)

print("🔋 ระบบ Scheduler เริ่มทำงานแล้ว... (กด Ctrl+C เพื่อหยุด)")
while True:
    schedule.run_pending()
    time.sleep(60)  # check the schedule once a minute
def solve_captcha(site_key, page_url, api_key="YOUR_2CAPTCHA_API_KEY",
                  timeout=180, poll_interval=5):
    """Ask the 2captcha service to solve a reCAPTCHA and return the token.

    Args:
        site_key: The reCAPTCHA site key found on the target page.
        page_url: URL of the page that shows the captcha.
        api_key: 2captcha API key (defaults to the original placeholder).
        timeout: Maximum number of seconds to wait for a solution.
        poll_interval: Seconds to sleep between result polls.

    Returns:
        The solved token string, or None when the request is rejected, the
        service answers with an error code, or ``timeout`` elapses.
    """
    # Pass values through ``params`` so requests URL-encodes them; a page URL
    # containing ``&`` or ``?`` would otherwise corrupt the query string.
    # HTTPS keeps the API key out of clear text.
    submit_reply = requests.get(
        "https://2captcha.com/in.php",
        params={
            "key": api_key,
            "method": "userrecaptcha",
            "googlekey": site_key,
            "pageurl": page_url,
        },
        timeout=30,
    ).text

    if 'OK|' not in submit_reply:
        return None

    captcha_id = submit_reply.split('|', 1)[1]

    # Poll for the answer. The deadline keeps the loop from spinning forever.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        time.sleep(poll_interval)
        res = requests.get(
            "https://2captcha.com/res.php",
            params={"key": api_key, "action": "get", "id": captcha_id},
            timeout=30,
        ).text
        if res == 'CAPCHA_NOT_READY':  # spelled this way in the original check
            continue
        if 'OK|' in res:
            return res.split('|', 1)[1]  # token to submit with the form
        # Any other reply is treated as final: stop instead of looping forever.
        return None
    return None

requests==2.31.0
lxml==5.1.0
beautifulsoup4==4.12.3
schedule==1.2.1
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin

class UltimateCrawler(SmartCrawler):
    """Crawler that walks nested sitemaps and scrapes basic page data.

    Inherits from ``SmartCrawler`` (per the original note, this is where the
    SQLite-backed bookkeeping comes from).
    """

    def __init__(self, user_agent="MyBot/1.0"):
        # The pasted snippet spelled this ``init`` (the double underscores were
        # lost), so Python never invoked it and ``self.headers`` was never set.
        super().__init__()
        self.headers = {"User-Agent": user_agent}

    def get_all_urls_recursive(self, sitemap_url, _seen=None):
        """Collect page URLs from a sitemap, following sitemap indexes.

        Args:
            sitemap_url: URL of a sitemap or sitemap index document.
            _seen: Internal set of sitemap URLs already visited; callers
                should leave it unset.

        Returns:
            A de-duplicated list of page URLs (order not guaranteed). Errors
            are printed and yield whatever was collected so far.
        """
        # Guard against sitemap indexes that reference each other.
        seen = set() if _seen is None else _seen
        if sitemap_url in seen:
            return []
        seen.add(sitemap_url)

        all_pages = []
        try:
            response = requests.get(sitemap_url, headers=self.headers, timeout=10)
            if response.status_code != 200:
                return []

            root = etree.fromstring(response.content)

            # 1. Sitemap index: recurse into every <sitemap> entry.
            for sitemap in root.xpath("//*[local-name()='sitemap']"):
                locs = sitemap.xpath(".//*[local-name()='loc']/text()")
                # Skip entries without a <loc> instead of aborting the file.
                if locs and locs[0].strip():
                    all_pages.extend(
                        self.get_all_urls_recursive(locs[0].strip(), seen)
                    )

            # 2. Regular sitemap: collect every <url> entry.
            for url_node in root.xpath("//*[local-name()='url']"):
                locs = url_node.xpath(".//*[local-name()='loc']/text()")
                if locs and locs[0].strip():
                    all_pages.append(locs[0].strip())

        except Exception as e:
            print(f"⚠️ พลาดที่ {sitemap_url}: {e}")
        return list(set(all_pages))  # drop duplicates

    def scrape_data(self, url):
        """Fetch a page and extract its title and meta description.

        Args:
            url: Page URL to download.

        Returns:
            On success, a dict with ``title`` and ``description`` keys. On
            failure, an error *string*, so callers must check the type before
            indexing the result.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # --- Adjust the selectors below to the target site's structure ---
            heading = soup.find('h1')
            meta = soup.find('meta', {'name': 'description'})
            data = {
                "title": heading.get_text(strip=True) if heading else "No Title",
                # ``.get`` tolerates a meta tag that has no ``content`` attribute.
                "description": meta.get('content', "No Desc") if meta else "No Desc",
            }
            # ------------------------------------------------------------------
            return data
        except Exception as e:
            return f"Error scraping {url}: {e}"

# --- การใช้งานจริง ---

crawler = UltimateCrawler()

# 1. Collect every page URL reachable from the main sitemap.
target_sitemap = "https://www.example.com/sitemap_index.xml"
target_urls = crawler.get_all_urls_recursive(target_sitemap)

for url in target_urls:
    if not crawler.is_visited(url):
        print(f"🚀 กำลังดึงข้อมูลจาก: {url}")

        # 2. Scrape the page.
        content = crawler.scrape_data(url)
        if isinstance(content, dict):
            print(f"📄 ข้อมูลที่ได้: {content['title']}")

            # 3. Record the visit only after a successful scrape.
            crawler.mark_as_visited(url)
        else:
            # scrape_data returns an error string on failure; indexing it with
            # ['title'] would raise TypeError.
            print(content)

        # 4. Wait between requests to be polite.
        time.sleep(1)

import requests
import re
import time
from urllib.parse import urljoin

class ProfessionalRobotsChecker:
    """Load a site's robots.txt and answer allow/deny questions for one agent.

    Attributes are filled by :meth:`fetch_robots` / :meth:`parse`:
    ``rules`` (list of rule dicts), ``sitemaps`` (list of URLs) and
    ``crawl_delay`` (seconds).

    NOTE: rules from the group matching ``user_agent_name`` and from the ``*``
    group are merged, as in the original implementation.
    """

    def __init__(self, user_agent_name="*"):
        # The pasted snippet spelled this ``init`` (the double underscores were
        # lost), so Python never invoked it as the constructor.
        self.target_agent = user_agent_name.lower()
        self.rules = []
        self.sitemaps = []
        self.crawl_delay = 0  # seconds

    def _pattern_to_regex(self, pattern):
        """Translate a robots.txt path pattern into an anchored regex string.

        ``*`` becomes ``.*`` and a trailing ``$`` is kept as an end anchor;
        every other character is matched literally.
        """
        regex = re.escape(pattern).replace(r'\*', '.*')
        if regex.endswith(r'\$'):
            regex = regex[:-2] + '$'
        return f"^{regex}"

    def parse(self, robots_text):
        """Parse robots.txt content and append what applies to this agent.

        Args:
            robots_text: The raw text of a robots.txt file.
        """
        group_agents = set()      # user agents named by the current group
        group_has_rules = False   # True once the current group has a rule line

        for raw_line in robots_text.splitlines():
            # Everything after '#' is a comment, including inline comments.
            line = raw_line.split('#', 1)[0].strip()
            if not line or ':' not in line:
                continue

            key, value = [item.strip() for item in line.split(':', 1)]
            key_lower = key.lower()

            # 1. Sitemaps are global and independent of any user agent.
            if key_lower == 'sitemap':
                self.sitemaps.append(value)
                continue

            # 2. Consecutive User-agent lines share the rules that follow; a
            #    User-agent line after rule lines starts a new group.
            if key_lower == 'user-agent':
                if group_has_rules:
                    group_agents = set()
                    group_has_rules = False
                group_agents.add(value.lower())
                continue

            group_has_rules = True

            # 3. Keep only rules addressed to us or to the wildcard agent.
            if self.target_agent not in group_agents and '*' not in group_agents:
                continue

            if key_lower in ('allow', 'disallow'):
                # An empty value imposes no restriction. Storing it would
                # create a pattern that matches (and blocks) every path.
                if not value:
                    continue
                self.rules.append({
                    'pattern': value,
                    'regex': re.compile(self._pattern_to_regex(value)),
                    'allow': (key_lower == 'allow'),
                    'length': len(value)
                })
            elif key_lower == 'crawl-delay':
                try:
                    self.crawl_delay = float(value)
                except ValueError:
                    # Ignore a malformed delay rather than abort the parse.
                    pass

    def fetch_robots(self, site_url):
        """Download ``/robots.txt`` for ``site_url`` and load its rules.

        Returns:
            A human-readable status string (success, non-200 status, or the
            error message); exceptions are not propagated.
        """
        robots_url = urljoin(site_url, "/robots.txt")
        try:
            response = requests.get(robots_url, timeout=10)
            if response.status_code != 200:
                return f"⚠️ ไม่พบ robots.txt (Status: {response.status_code})"

            self.parse(response.text)
            return "✅ โหลดกฎสำเร็จ!"
        except Exception as e:
            return f"❌ ผิดพลาด: {e}"

    def can_fetch(self, path):
        """Return True when ``path`` may be crawled under the loaded rules."""
        matches = [r for r in self.rules if r['regex'].match(path)]
        if not matches:
            return True
        # Longest match wins (RFC 9309); on equal length prefer Allow, the
        # least restrictive rule.
        best = max(matches, key=lambda r: (r['length'], r['allow']))
        return best['allow']

    def wait_before_crawl(self):
        """Sleep for the configured Crawl-delay, if any."""
        if self.crawl_delay > 0:
            print(f"⏳ กำลังรอ {self.crawl_delay} วินาที ตามกฎ Crawl-delay...")
            time.sleep(self.crawl_delay)

# --- วิธีการนำไปใช้ในโปรเจ็กต์ ---

bot = ProfessionalRobotsChecker(user_agent_name="MySuperBot")
print(bot.fetch_robots("https://www.google.com"))

# 1. Show the sitemaps that were found.
print(f"📂 Sitemaps found: {bot.sitemaps}")

# 2. Show the crawl delay.
print(f"⏱️ Crawl-delay: {bot.crawl_delay} seconds")

# 3. Simulate polite crawling.
target_path = "/search/how-to-code"
if bot.can_fetch(target_path):
    bot.wait_before_crawl()  # wait if the site asks for a delay
    print(f"🚀 เริ่มดึงข้อมูลจาก {target_path}...")
else:
    print(f"🚫 ถูกปฏิเสธการเข้าถึง {target_path}")
import requests
import sqlite3
import time
from lxml import etree
from urllib.parse import urljoin

class SmartCrawler:
    """Crawler base that remembers visited URLs in a SQLite database."""

    def __init__(self, db_name="crawler_cache.db"):
        # The pasted snippet spelled this ``init`` (the double underscores were
        # lost), so Python never invoked it and ``self.conn`` was never set.
        # 1. Open (or create) the SQLite database.
        self.conn = sqlite3.connect(db_name)
        self._init_db()
        self.sitemaps = []

    def _init_db(self):
        """Create the ``visited_urls`` table if it does not exist yet."""
        cursor = self.conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS visited_urls (
                url TEXT PRIMARY KEY,
                visited_at DATETIME DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        self.conn.commit()

    def is_visited(self, url):
        """Return True when ``url`` has already been recorded as visited."""
        cursor = self.conn.cursor()
        cursor.execute('SELECT 1 FROM visited_urls WHERE url = ?', (url,))
        return cursor.fetchone() is not None

    def mark_as_visited(self, url):
        """Record ``url`` as visited; recording it again is a no-op."""
        try:
            cursor = self.conn.cursor()
            cursor.execute('INSERT INTO visited_urls (url) VALUES (?)', (url,))
            self.conn.commit()
        except sqlite3.IntegrityError:
            # The URL is the primary key, so a duplicate insert lands here.
            pass

    def parse_sitemap(self, sitemap_url):
        """Return every ``<loc>`` URL found in a sitemap XML document.

        Errors are printed and result in an empty (or partial) list.
        """
        print(f"🔍 กำลังอ่าน Sitemap: {sitemap_url}")
        urls = []
        try:
            response = requests.get(sitemap_url, timeout=10)
            if response.status_code == 200:
                # local-name() matches <loc> regardless of XML namespace.
                root = etree.fromstring(response.content)
                for loc in root.xpath("//*[local-name()='loc']"):
                    # An empty <loc/> has text None; skip it so callers never
                    # receive None in place of a URL.
                    text = (loc.text or "").strip()
                    if text:
                        urls.append(text)
            print(f"✨ เจอทั้งหมด {len(urls)} ลิงก์")
        except Exception as e:
            print(f"❌ อ่าน Sitemap ไม่ได้: {e}")
        return urls

    def close(self):
        """Close the SQLite connection."""
        self.conn.close()

# --- วิธีการนำมาประกอบร่าง (Integration Example) ---

# สมมติว่าเราได้ sitemaps มาจาก ProfessionalRobotsChecker (โค้ดก่อนหน้า)

my_crawler = SmartCrawler()
found_urls = my_crawler.parse_sitemap("https://www.example.com/sitemap.xml")

for url in found_urls:
    # 1. Skip URLs already recorded in SQLite.
    if my_crawler.is_visited(url):
        print(f"⏭️ ข้าม {url} (เคยไปแล้ว)")
        continue

    # 2. Check the robots.txt rules (using the checker class shown earlier).
    # if bot_checker.can_fetch(url):

    print(f"🚀 กำลังทำงานกับ: {url}")

    # Record the visit in the database.
    my_crawler.mark_as_visited(url)

    # Wait between requests to be polite.
    time.sleep(1)

import csv
import json

class UltimateCrawler(SmartCrawler):
    """Export helpers for the crawler (CSV and JSON).

    NOTE: the original snippet marks the rest of the class as elided, so these
    methods are meant to be added to the existing class body.
    """
    # ... (existing code from the previous step) ...

    def save_to_csv(self, data_list, filename="scraped_data.csv"):
        """Write a list of dicts to a CSV file.

        Args:
            data_list: Rows to write, one dict per row.
            filename: Output path.

        Prints a status line; write errors are printed, not raised.
        """
        if not data_list:
            print("⚠️ ไม่มีข้อมูลให้บันทึก")
            return

        try:
            # Header = union of all row keys in first-seen order. Taking the
            # keys of the first row only makes DictWriter reject any later row
            # that carries an extra key.
            fieldnames = list(dict.fromkeys(key for row in data_list for key in row))
            # utf-8-sig is kept from the original (it writes a BOM).
            with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
                dict_writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
                dict_writer.writeheader()
                dict_writer.writerows(data_list)
            print(f"✅ บันทึกไฟล์ CSV เรียบร้อย: {filename}")
        except Exception as e:
            print(f"❌ เกิดข้อผิดพลาดในการเขียน CSV: {e}")

    def save_to_json(self, data_list, filename="scraped_data.json"):
        """Write a list of dicts to a JSON file (UTF-8, non-ASCII kept as-is).

        Prints a status line; write errors are printed, not raised.
        """
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data_list, f, ensure_ascii=False, indent=4)
            print(f"✅ บันทึกไฟล์ JSON เรียบร้อย: {filename}")
        except Exception as e:
            print(f"❌ เกิดข้อผิดพลาดในการเขียน JSON: {e}")

# --- ตัวอย่างการรันกระบวนการทั้งหมด ---

crawler = UltimateCrawler()
all_scraped_results = []

# Assume this is the bot's main work loop.
urls = ["https://example.com/p1", "https://example.com/p2"]

for url in urls:
    # 1. Scrape the page.
    data = crawler.scrape_data(url)
    if not isinstance(data, dict):
        # scrape_data returns an error string on failure; item assignment on
        # it would raise TypeError, so report the error and move on.
        print(data)
        continue

    # Add the URL to the record so its origin is known.
    data['url'] = url
    all_scraped_results.append(data)

    # 2. Record the visit in SQLite.
    crawler.mark_as_visited(url)

# 3. Export the data.
crawler.save_to_csv(all_scraped_results)
crawler.save_to_json(all_scraped_results)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Robotslxml.txt #5108

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

Robotslxml.txt #5108

Uh oh!

Wichitrawinu Feb 10, 2026

--- การตั้งค่าและเริ่มการทำงาน ---

--- ตั้งเวลา (Scheduler) ---

ทำงานทุกวันเวลา 08:30 น.

หรือจะให้ทำทุกๆ 1 ชั่วโมงก็ได้:

schedule.every(1).hours.do(job)

--- การใช้งานจริง ---

1. ขุดหารูปแบบ URL ทั้งหมดจาก Sitemap หลัก

--- วิธีการนำไปใช้ในโปรเจ็กต์ ---

1. แสดงรายการ Sitemap ที่เจอ

2. เช็ค Crawl-delay

3. จำลองการทำงานแบบสุภาพ (Polite Crawling)

--- วิธีการนำมาประกอบร่าง (Integration Example) ---

สมมติว่าเราได้ sitemaps มาจาก ProfessionalRobotsChecker (โค้ดก่อนหน้า)

--- ตัวอย่างการรันกระบวนการทั้งหมด ---

สมมติว่านี่คือลูปการทำงานของบอท

3. ส่งออกข้อมูล (Export)

Replies: 0 comments

Wichitrawinu
Feb 10, 2026