Skip to content

Commit 9054120

Browse files
Split integration tests into separate files and add a test for Salam Pax's blog
1 parent b2f7543 commit 9054120

File tree

7 files changed

+46
-10
lines changed

7 files changed

+46
-10
lines changed

blog2epub/crawlers/article_factory/abstract.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ def __init__(
2121
downloader: Downloader,
2222
cancelled: bool = False,
2323
download_callback: Optional[Callable] = None,
24+
blog_title: Optional[str] = None,
25+
blog_description: Optional[str] = None,
2426
):
2527
self.url = url
2628
self.html: bytes = html_content
@@ -37,6 +39,8 @@ def __init__(
3739
self.comments = "" # TODO: should be a list in the future
3840
self.cancelled: bool = cancelled
3941
self.download_callback = download_callback
42+
self.blog_title: Optional[str] = blog_title
43+
self.blog_description: Optional[str] = blog_description
4044

4145
@abstractmethod
4246
def process(self) -> ArticleModel:
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from typing import Optional
2+
3+
from lxml.html.soupparser import fromstring
4+
5+
from blog2epub.crawlers.article_factory.default import DefaultArticleFactory
6+
from blog2epub.models.book import ArticleModel
7+
8+
9+
class BlogspotArticleFactory(DefaultArticleFactory):
    """Article factory used for Blogspot pages.

    Reuses the default extraction logic, but replaces an article title that
    is identical to the blog's own title.
    """

    def get_title(self) -> Optional[str]:
        """Return the article title, substituting one that equals the blog title.

        Returns:
            The title produced by the parent implementation, unless
            ``self.blog_title`` is set and equal to it, in which case a fixed
            substitute string is returned.
        """
        parsed_title = super().get_title()
        if self.blog_title is not None and parsed_title == self.blog_title:
            # NOTE(review): "Pomidor" looks like a placeholder value --
            # confirm what the intended fallback title should be.
            return "Pomidor"
        return parsed_title

    def process(self) -> ArticleModel:
        """Parse ``self.html`` and assemble an ``ArticleModel`` from it.

        The parsed tree is stored on ``self.tree`` before any getter runs,
        because the getters read from it.
        """
        self.tree = fromstring(self.html)
        # Fields are collected in the same order the model is populated:
        # url, title, date, images, tags, content, comments.
        article_fields = dict(
            url=self.url,
            title=self.get_title(),
            date=self.get_date(),
            images=self.get_images(),
            tags=self.get_tags(),
            content=self.get_content(),
            comments=self.get_comments(),
        )
        return ArticleModel(**article_fields)

blog2epub/crawlers/article_factory/default.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,16 @@
1616

1717
class DefaultArticleFactory(AbstractArticleFactory):
1818
def get_title(self) -> Optional[str]:
    """Return the article title found by the first matching title pattern.

    Walks ``self.patterns.title`` in order, evaluates each pattern's XPath
    against ``self.tree`` and takes the first result of the first pattern
    that yields anything. The value is stripped and HTML-unescaped.

    Returns:
        The cleaned title, or ``None`` when the tree or the patterns are
        missing, or when no pattern produces a result.
    """
    if self.tree is None or self.patterns is None:
        return None
    for title_pattern in self.patterns.title:
        if not title_pattern.xpath:
            continue
        # Keep the raw XPath result under its own name so that an unmatched
        # pattern can never leak a list out through the return value.
        matches = self.tree.xpath(title_pattern.xpath)
        if matches:
            # A single match is a valid title; only an empty result is a miss.
            return html.unescape(matches[0].strip())
    return None
2729

2830
def get_date(self) -> Optional[datetime]:
2931
result_date = None

blog2epub/crawlers/blogspot.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/usr/bin/env python3
22
# -*- coding : utf-8 -*-
3+
4+
from blog2epub.crawlers.article_factory.blogspot import BlogspotArticleFactory
35
from blog2epub.crawlers.default import DefaultCrawler
46
from blog2epub.models.content_patterns import Pattern
57

@@ -10,7 +12,7 @@ class BlogspotCrawler(DefaultCrawler):
1012
def __init__(self, **kwargs):
1113
super().__init__(**kwargs)
1214
self.name = "blogger.com crawler"
13-
15+
self.article_factory_class = BlogspotArticleFactory
1416
self.patterns.content.append(
1517
Pattern(
1618
xpath="//div[contains(@class, 'post-body')]",

blog2epub/crawlers/default.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,7 @@ def crawl(self):
364364
language=self.language,
365365
downloader=self.downloader,
366366
download_callback=self._break_the_loop,
367+
blog_title=self.title,
367368
)
368369
art = art_factory.process()
369370
self.images = self.images + art.images

tests/integration/blog2epub/test_blog2epub_salam_pax.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,11 @@ def mock_configuration() -> ConfigurationModel:
1616
)
1717

1818

19-
class TestBlog2EPubMainSalamPax:
20-
def test_velosov_can_parse_the_date(self, mock_configuration):
19+
class TestBlog2EPubSalamPax:
20+
def test_salam_pax_get_different_titles(self, mock_configuration):
2121
# given
2222
given_blog2epub = Blog2Epub(
23-
url="dear_raed.blogspot.com",
23+
url="http://dear_raed.blogspot.com",
2424
interface=EmptyInterface(),
2525
configuration=mock_configuration,
2626
cache_folder="tests_cache",
@@ -32,6 +32,6 @@ def test_velosov_can_parse_the_date(self, mock_configuration):
3232
interface=EmptyInterface(),
3333
configuration=mock_configuration,
3434
)
35-
ebook.save()
3635
# then
37-
pass
36+
assert len(ebook.book_data.articles) == 2
37+
assert ebook.book_data.articles[0].title != ebook.book_data.articles[1].title

tests/integration/blog2epub/test_blog2epub_velosov.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,5 @@ def test_velosov_can_parse_the_date(self, mock_configuration):
3434
)
3535
ebook.save()
3636
# then
37-
pass
37+
assert len(ebook.book_data.articles) == 2
38+
assert ebook.book_data.articles[0].title != ebook.book_data.articles[1].title

0 commit comments

Comments
 (0)