Skip to content

Commit 10c2c32

Browse files
committed
Add site map handles [beta]
1 parent 43bc21b commit 10c2c32

File tree

2 files changed

+179
-32
lines changed

2 files changed

+179
-32
lines changed

src/ezweb/objects/soup.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from collections import Counter
22
import json
3+
import re
34
from bs4.element import Tag
45
from dateutil.parser import parse as date_parse
56
from trafilatura import extract
@@ -8,7 +9,14 @@
89
from cached_property import cached_property
910

1011
#
11-
from ezweb.utils.http import safe_get, soup_of, pure_url, name_from_url, url_host
12+
from ezweb.utils.http import (
13+
safe_get,
14+
safe_head,
15+
soup_of,
16+
pure_url,
17+
name_from_url,
18+
url_host,
19+
)
1220
from ezweb.utils.text import similarity_of, clean_title
1321
from ezweb.utils.souphelper import EzSoupHelper
1422
from ezweb.utils.io import create_file
@@ -20,6 +28,7 @@ def __init__(self, content: str, url: str = None) -> None:
2028
self.soup = soup_of(self.content)
2129
self.url = url
2230
self.helper = EzSoupHelper(self.soup, self.url)
31+
self.c = 0
2332

2433
@staticmethod
2534
def from_url(url: str):
@@ -34,10 +43,76 @@ def url_parts(self):
3443
def site_name_from_host(self):
3544
return name_from_url(self.url)
3645

46+
@cached_property
def site_map_url(self):
    """URL of the site's sitemap, or None if none can be found.

    Prefers the sitemap advertised in robots.txt; otherwise probes
    the conventional filenames at the site root with a HEAD request.
    """
    # A sitemap declared in robots.txt is authoritative — use it first.
    if self.site_map_url_from_robots_txt:
        return self.site_map_url_from_robots_txt
    if not self.root_url:
        # Without a root URL there is nothing to probe.
        return None
    for candidate in ("sitemap.xml", "sitemap_index.xml"):
        # root_url carries no trailing slash, so join with an explicit "/"
        # (the original concatenated them directly, producing e.g.
        # "https://example.comsitemap.xml").
        url = self.root_url + "/" + candidate
        if safe_head(url).ok:
            return url
    return None
60+
61+
@cached_property
def site_map_url_from_robots_txt(self):
    """Sitemap URL declared in robots.txt, normalized to https, or None.

    Returns None when robots.txt is unavailable or declares no sitemap.
    """
    robots = self.robots_txt
    if not robots:
        return None
    # Field names in robots.txt are case-insensitive per the spec.
    match = re.search(r"Sitemap:(.+)", robots, re.IGNORECASE)
    if match is None:
        # The original called .group(1) before checking, raising
        # AttributeError whenever no Sitemap line was present.
        return None
    url = match.group(1).strip()
    # Force an https scheme, mirroring the original intent; guard the
    # split so a malformed value without "://" cannot IndexError.
    if "://" in url and not url.startswith("https"):
        url = "https://" + url.split("://", 1)[1]
    return url
70+
71+
@cached_property
def site_map_product_links(self):
    """Sitemap links whose URL path looks product-related."""
    keywords = ["product"]
    return self.site_map_links(contain=keywords)
74+
75+
@cached_property
def site_map_article_links(self):
    """Sitemap links whose URL path looks article/blog-related."""
    keywords = ["article", "blog"]
    return self.site_map_links(contain=keywords)
78+
79+
def site_map_links(self, contain: list = None):
    """Return the unique page URLs reachable from the site's sitemap.

    Nested sitemap files (*.xml) are fetched and expanded one level.
    `contain` optionally restricts top-level links to those whose URL
    path matches one of the given keywords (see
    helper.get_site_map_links). Returns [] when no sitemap URL could
    be determined.
    """
    if not self.site_map_url:
        # No sitemap found for this site — nothing to crawl.
        # (The original crashed inside EzSoup.from_url(None) here.)
        return []
    soup = EzSoup.from_url(self.site_map_url)
    hrefs = self.helper.get_site_map_links(soup, contain=contain) or []
    not_xmls = []

    def checker(link: str):
        # Sitemap-index entries point at further .xml files: expand them.
        if link.split(".")[-1] == "xml":
            child_soup = EzSoup.from_url(link)
            # NOTE(review): child links are not filtered by `contain` —
            # behavior preserved from the original; confirm it is intended.
            children = self.helper.get_site_map_links(child_soup)
            not_xmls.extend(children)
        else:
            not_xmls.append(link)

    # list.append/extend are atomic in CPython, so sharing not_xmls
    # across worker threads is safe here.
    with ThreadPoolExecutor() as e:
        e.map(checker, hrefs)

    return list(set(not_xmls))
98+
99+
@cached_property
def robots_txt(self):
    """Raw text of the site's /robots.txt, or None when no root URL is known."""
    root = self.root_url
    if not root:
        return None
    response = safe_get(root + "/robots.txt")
    return response.text
105+
37106
@cached_property
def root_domain(self):
    """Host name of `self.url` with any leading "www." removed."""
    host = url_host(self.url)
    return host.replace("www.", "")
40109

110+
@cached_property
def root_url(self):
    """Scheme-qualified root URL ("https://<domain>"), or None without a domain."""
    domain = self.root_domain
    return f"https://{domain}" if domain else None
115+
41116
@cached_property
42117
def title_tag_text(self):
43118
tag = self.helper.first("title")
@@ -413,3 +488,7 @@ def save_content_summary_json(self, path: str = None, custom_content: str = None
413488
def save_important_links(self, path: str = None):
    """Write this page's important hrefs, one per line, to a text file.

    Defaults to "<page title>.txt" when no path is given.
    """
    target = path if path else self.title + ".txt"
    content = "\n".join(self.important_hrefs)
    create_file(target, content)
491+
492+
def save_site_map_links(self, contain: list = None, path: str = None):
    """Write the site-map links (optionally filtered by `contain`) to a text file.

    Defaults to "<page title>.txt" when no path is given.
    """
    target = path if path else self.title + ".txt"
    links = self.site_map_links(contain=contain)
    create_file(target, "\n".join(links))

src/ezweb/utils/souphelper.py

Lines changed: 99 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import itertools
1010

1111
#
12-
from ezweb.utils.http import name_from_url
12+
from ezweb.utils.http import name_from_url, pure_url
1313
from ezweb.utils.text import clean_text, clean_title, similarity_of
1414

1515

@@ -46,7 +46,13 @@ def possible_topic_tags(self) -> List[Tag]:
4646
returns possible topic/breadcrump tags of webpage
4747
generated from soup (HTML) itself .
4848
"""
49+
# get some nav
50+
nav = []
51+
for n in self.all("nav"):
52+
if 1 > len(n.find_all("a" , href=True)) <= 4:
53+
nav.append(n)
4954

55+
5056
id_bread = self.all_contains("id", "breadcrumb")
5157
class_bread = self.all_contains("class", "breadcrumb")
5258
breads = id_bread + class_bread
@@ -56,10 +62,21 @@ def possible_topic_tags(self) -> List[Tag]:
5662
class_maybe = class_cat + class_tag
5763

5864
# avoid using not related tags
59-
if len(class_maybe) > 7:
65+
if len(class_maybe) > 6:
6066
class_maybe = []
61-
62-
maybe_elements_containers = breads + class_maybe
67+
68+
# avoid using not related tags
69+
for tag in breads :
70+
bread_a_tags= []
71+
if tag.name == "a" :
72+
bread_a_tags.append(tag)
73+
for a in tag.find_all("a"):
74+
bread_a_tags.append(a)
75+
if len(bread_a_tags) > 10 :
76+
breads = []
77+
78+
print("nav", len(nav), "breads", len(breads), "class_maybe", len(class_maybe))
79+
maybe_elements_containers = nav + breads + class_maybe
6380
maybe_elements = []
6481

6582
# filling maybe_elements with all <a> in selected parents (containers)
@@ -73,26 +90,36 @@ def possible_topic_tags(self) -> List[Tag]:
7390
article_ul_tag = article.find("ul") if article else None
7491
article_ul_a = article_ul_tag.find_all("a") if article_ul_tag else []
7592

93+
print("maybe" , len(maybe_elements) , "article_ul" , len(article_ul_a))
7694
tags = maybe_elements + article_ul_a
7795
return tags
7896

7997
@cached_property
8098
def table_info(self):
81-
t = self.first("table")
82-
if not t:
83-
return []
84-
rows = t.find_all("tr")
85-
if not rows:
86-
return []
87-
data = [
88-
{
89-
self.tag_text(head): self.tag_text(cell)
90-
for cell in row.find_all("td")
91-
for head in row.find_all("th")
92-
}
93-
for row in rows
94-
]
95-
return data
99+
tables = self.all("table")
100+
result = []
101+
for table in tables:
102+
if not table:
103+
continue
104+
rows = table.find_all("tr")
105+
if not rows:
106+
return []
107+
for row in rows:
108+
cells = row.find_all("td")
109+
headers = row.find_all("th")
110+
if not cells or not headers:
111+
break
112+
for head, cell in zip(headers, cells):
113+
ht = self.tag_text(head)
114+
ct = self.tag_text(cell)
115+
if ht == ct:
116+
break
117+
d = {ht: ct}
118+
if not d:
119+
break
120+
result.append(d)
121+
122+
return result
96123

97124
@cached_property
98125
def possible_topic_names(self):
@@ -106,7 +133,7 @@ def possible_topic_names(self):
106133
return list(set(result))
107134

108135
@cached_property
109-
def address(self):
136+
def addresses(self):
110137
classes = ["address", "location", "contact"]
111138
words = [
112139
"آدرس",
@@ -122,14 +149,17 @@ def address(self):
122149
"تفاطع",
123150
]
124151
#
152+
def _result(res: list):
153+
return sorted(res)
154+
125155
def _texts_of(tags):
126156
return list({clean_text(t.text) for t in tags if t.text})
127157

128158
ad_tags = self.all("address")
129159
if ad_tags:
130160
texts = _texts_of(ad_tags)
131161
if texts:
132-
return texts[0]
162+
return _result(texts)
133163

134164
def _f(class_name):
135165
"""Returns all `class_name` like tags in the footer"""
@@ -141,18 +171,23 @@ def _f(class_name):
141171

142172
if tags:
143173
texts = _texts_of(tags)
144-
return texts[0] if texts else None
145-
174+
return _result(texts) if texts else None
146175
else:
147176
# searching
148177
footer = self.all("footer")[-1]
149178
if not footer:
150179
return None
151180
for w in words:
152181
search = footer.find_all(text=True)
153-
texts = list({clean_text(text) for text in search if w in text})
182+
texts = list(
183+
{
184+
clean_text(text)
185+
for text in search
186+
if w in text and len(text) >= 45
187+
}
188+
)
154189
if texts:
155-
return texts[0]
190+
return _result(texts)
156191

157192
@cached_property
158193
def question_answers(self):
@@ -164,14 +199,14 @@ def question_answers(self):
164199
def _bad_topic_names(self):
165200
vocab = {
166201
"fa": ["فروشگاه", "خانه", "صفحه اصلی", "برگشت", "بازگشت"],
167-
"en": ["home", "return", "back", "undo", "shop"],
202+
"en": ["home", "return", "back", "undo", "shop" , "change"],
168203
}
169204
# merge all d values list into one list of str
170205
result = list(itertools.chain.from_iterable(vocab.values()))
171206
return result
172207

173208
@cached_property
174-
def application_json(self):
209+
def application_json(self) -> dict:
175210
all_json_tags = self.all("script", attrs={"type": "application/ld+json"})
176211
if not all_json_tags:
177212
return None
@@ -229,6 +264,27 @@ def contains(self, tag_name: str, attr: str, value: str):
229264
"""
230265
return self.xpath(f'{tag_name}[{attr}*="{value}"]')
231266

267+
def get_site_map_links(self, soup, contain: list = None):
    """Collect candidate links from a sitemap soup.

    Falls back to <loc> elements when the page exposes fewer than 3
    plain <a href> links (XML sitemaps list URLs in <loc>, not <a>).
    When `contain` is given, keeps only URLs whose first or second
    path segment contains one of the keywords (case-insensitive).
    """
    hrefs = soup.a_tag_hrefs
    if not hrefs or len(hrefs) < 3:
        locs = soup.helper.all("loc")
        if locs:
            hrefs = [self.tag_text(t) for t in locs]
    # Guard: a_tag_hrefs may be None and no <loc> tags found — the
    # original then crashed iterating None in the filter below.
    hrefs = hrefs or []
    if contain:
        keywords = [w.lower() for w in contain]

        def matches(url: str) -> bool:
            # pure_url(url) is invariant per URL — compute it once
            # instead of once per keyword as the original did.
            parts = pure_url(url)
            segments = [p.lower() for p in parts[1:3]]
            return any(w in seg for seg in segments for w in keywords)

        hrefs = [link for link in hrefs if matches(link)]
    return hrefs
287+
232288
def linked_files(self, extension: str):
233289
"""
234290
returns all `<a>` tags that their `href` contains `.extension`
@@ -243,14 +299,26 @@ def linked_files(self, extension: str):
243299
"""
244300
return self.contains("a", "href", f".{extension}")
245301

246-
def from_structured_data(
    self, key: str, single: bool = False, unique: bool = False
):
    """Look up `key` in the page's structured data (JSON-LD).

    Guide : https://developers.google.com/search/docs/advanced/structured-data/intro-structured-data

    Test a URL : https://developers.google.com/search/docs/advanced/structured-data/

    Parameters
    ----------
    key : the structured-data field to extract.
    single : return only the first value (or None when nothing found).
    unique : drop duplicate values, keeping first-seen order.
    """
    result = self.from_json_schema(key)
    # TODO: providing other structured schemas like
    # RDFa and Microdata
    if unique and result:
        # dict.fromkeys dedupes while preserving first-seen order;
        # the original list(set(...)) was non-deterministic and
        # crashed when result was None.
        result = list(dict.fromkeys(result))
    if single:
        return result[0] if result else None
    return result
254322

255323
def from_json_schema(self, key: str):
256324
"""
@@ -289,7 +357,7 @@ def _ok_topic_name(self, name: str):
289357
if not name or name == "" or len(name) > 26:
290358
# print("Null topic name or many charachters")
291359
return False
292-
if name in self._bad_topic_names:
360+
if name.split()[0] in self._bad_topic_names:
293361
return False
294362
site_name = self.site_name
295363
msg = f"| name : {name} , site name : {site_name}"

0 commit comments

Comments
 (0)