From bd47d0a7048a704f949c61edc9a428157a09626e Mon Sep 17 00:00:00 2001
From: timf34
Date: Sat, 10 May 2025 20:38:04 +0100
Subject: [PATCH 1/2] updated date selector

---
 substack_scraper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/substack_scraper.py b/substack_scraper.py
index a1dc5534..e0c92a32 100644
--- a/substack_scraper.py
+++ b/substack_scraper.py
@@ -257,9 +257,10 @@ def extract_post_data(self, soup: BeautifulSoup) -> Tuple[str, str, str, str, st
         subtitle_element = soup.select_one("h3.subtitle")
         subtitle = subtitle_element.text.strip() if subtitle_element else ""
 
+
         date_element = soup.find(
             "div",
-            class_="pencraft pc-reset _color-pub-secondary-text_1xu16_194 _line-height-20_1xu16_81 _font-meta_1xu16_116 _size-11_1xu16_32 _weight-medium_1xu16_146 _transform-uppercase_1xu16_241 _reset_1xu16_2 _meta_1xu16_441"
+            class_="pencraft pc-reset color-pub-secondary-text-hGQ02T line-height-20-t4M0El font-meta-MWBumP size-11-NuY2Zx weight-medium-fw81nC transform-uppercase-yKDgcq reset-IxiVJZ meta-EgzBVA"
         )
         date = date_element.text.strip() if date_element else "Date not found"
 

From 143fcb8232acdd2a5f8d2af0fd488b18e9aec0ef Mon Sep 17 00:00:00 2001
From: dsouzaankit
Date: Tue, 12 Aug 2025 18:40:29 -0400
Subject: [PATCH 2/2] selenium edge url not found fix

---
 substack_scraper.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/substack_scraper.py b/substack_scraper.py
index e0c92a32..ad891d58 100644
--- a/substack_scraper.py
+++ b/substack_scraper.py
@@ -20,13 +20,13 @@ from urllib.parse import urlparse
 
 from config import EMAIL, PASSWORD
 
-USE_PREMIUM: bool = False  # Set to True if you want to login to Substack and convert paid for posts
-BASE_SUBSTACK_URL: str = "https://www.thefitzwilliam.com/"  # Substack you want to convert to markdown
+USE_PREMIUM: bool = True  # Set to True if you want to login to Substack and convert paid for posts
+BASE_SUBSTACK_URL: str = "https://premSubstk.io/"  # Substack you want to convert to markdown
 BASE_MD_DIR: str = "substack_md_files"  # Name of the directory we'll save the .md essay files
 BASE_HTML_DIR: str = "substack_html_pages"  # Name of the directory we'll save the .html essay files
 HTML_TEMPLATE: str = "author_template.html"  # HTML template to use for the author page
 JSON_DATA_DIR: str = "data"
-NUM_POSTS_TO_SCRAPE: int = 3  # Set to 0 if you want all posts
+NUM_POSTS_TO_SCRAPE: int = 0  # Set to 0 if you want all posts
 
 
 def extract_main_part(url: str) -> str:
@@ -381,12 +381,14 @@ def __init__(
         if user_agent:
             options.add_argument(f'user-agent={user_agent}')  # Pass this if running headless and blocked by captcha
 
-        if edge_driver_path:
-            service = Service(executable_path=edge_driver_path)
-        else:
-            service = Service(EdgeChromiumDriverManager().install())
+        # if edge_driver_path:
+        #     service = Service(executable_path=edge_driver_path)
+        # else:
+        #     service = Service(EdgeChromiumDriverManager().install())
 
-        self.driver = webdriver.Edge(service=service, options=options)
+        os.environ["SE_DRIVER_MIRROR_URL"] = "https://msedgedriver.microsoft.com"
+        self.driver = webdriver.Edge()
+        # self.driver = webdriver.Edge(service=service, options=options)
         self.login()
 
     def login(self) -> None:
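
Note on PATCH 1/2: both the old and the new class strings look like hashed, build-specific
names ("_color-pub-secondary-text_1xu16_194" before, "color-pub-secondary-text-hGQ02T"
after), so this selector is likely to break again the next time Substack rebuilds its
frontend. A less brittle option is to match only the stable fragment of the class name.
The sketch below is illustrative and not part of the patch: it assumes the date container
keeps a class token containing "color-pub-secondary-text", and the helper name
find_date_element is made up for the example.

    from bs4 import BeautifulSoup

    def find_date_element(soup: BeautifulSoup):
        # Substring match on the stable class fragment instead of the full
        # hashed class list from one particular Substack build.
        return soup.select_one('div[class*="color-pub-secondary-text"]')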
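
Note on PATCH 2/2: webdriver.Edge() is now called with no arguments, so the Options object
assembled just above (including the user-agent argument shown in the context lines) and any
explicit edge_driver_path are silently ignored; only the driver-download problem is handled,
via the SE_DRIVER_MIRROR_URL environment variable. A variant that keeps both behaviours is
sketched below. It is a minimal sketch under the assumption that Selenium Manager honours
SE_DRIVER_MIRROR_URL as the patch relies on; build_edge_driver is a hypothetical helper,
not code from the repository.

    import os

    from selenium import webdriver
    from selenium.webdriver.edge.options import Options
    from selenium.webdriver.edge.service import Service

    def build_edge_driver(options: Options, edge_driver_path: str = "") -> webdriver.Edge:
        # Keep the mirror override from the patch so Selenium Manager can
        # fetch msedgedriver from Microsoft's endpoint.
        os.environ.setdefault("SE_DRIVER_MIRROR_URL", "https://msedgedriver.microsoft.com")
        if edge_driver_path:
            # Honour an explicitly provided driver binary, as the old code did.
            return webdriver.Edge(service=Service(executable_path=edge_driver_path), options=options)
        # Otherwise let Selenium Manager resolve the driver, but keep the options.
        return webdriver.Edge(options=options)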