99import itertools
1010
1111#
12- from ezweb .utils .http import name_from_url
12+ from ezweb .utils .http import name_from_url , pure_url
1313from ezweb .utils .text import clean_text , clean_title , similarity_of
1414
1515
@@ -46,7 +46,13 @@ def possible_topic_tags(self) -> List[Tag]:
4646 returns possible topic/breadcrump tags of webpage
4747 generated from soup (HTML) itself .
4848 """
49+ # get some nav
50+ nav = []
51+ for n in self .all ("nav" ):
52+ if 1 > len (n .find_all ("a" , href = True )) <= 4 :
53+ nav .append (n )
4954
55+
5056 id_bread = self .all_contains ("id" , "breadcrumb" )
5157 class_bread = self .all_contains ("class" , "breadcrumb" )
5258 breads = id_bread + class_bread
@@ -56,10 +62,21 @@ def possible_topic_tags(self) -> List[Tag]:
5662 class_maybe = class_cat + class_tag
5763
5864 # avoid using not related tags
59- if len (class_maybe ) > 7 :
65+ if len (class_maybe ) > 6 :
6066 class_maybe = []
61-
62- maybe_elements_containers = breads + class_maybe
67+
68+ # avoid using not related tags
69+ for tag in breads :
70+ bread_a_tags = []
71+ if tag .name == "a" :
72+ bread_a_tags .append (tag )
73+ for a in tag .find_all ("a" ):
74+ bread_a_tags .append (a )
75+ if len (bread_a_tags ) > 10 :
76+ breads = []
77+
78+ print ("nav" , len (nav ), "breads" , len (breads ), "class_maybe" , len (class_maybe ))
79+ maybe_elements_containers = nav + breads + class_maybe
6380 maybe_elements = []
6481
6582 # filling maybe_elements with all <a> in selected parents (containers)
@@ -73,26 +90,36 @@ def possible_topic_tags(self) -> List[Tag]:
7390 article_ul_tag = article .find ("ul" ) if article else None
7491 article_ul_a = article_ul_tag .find_all ("a" ) if article_ul_tag else []
7592
93+ print ("maybe" , len (maybe_elements ) , "article_ul" , len (article_ul_a ))
7694 tags = maybe_elements + article_ul_a
7795 return tags
7896
@cached_property
def table_info(self):
    """
    Collect header/cell pairs from every `<table>` found via `self.all`.

    Each `<tr>` is read as a row whose `<th>` elements are paired
    positionally with its `<td>` elements; every pair becomes its own
    single-entry dict ``{header_text: cell_text}``.

    Returns
    -------
    list of dict
        One ``{header: value}`` dict per pair, in document order.
        Empty when no table yields any pair.
    """
    result = []
    for table in self.all("table"):
        if not table:
            continue
        rows = table.find_all("tr")
        if not rows:
            # A table without rows contributes nothing, but pairs already
            # collected from earlier tables must be kept (the previous
            # `return []` here threw them away).
            continue
        for row in rows:
            cells = row.find_all("td")
            headers = row.find_all("th")
            if not cells or not headers:
                # NOTE(review): the first row lacking either <th> or <td>
                # stops reading of this table entirely (not just the row);
                # kept as-is — confirm this is intended for tables that
                # start with a header-only row.
                break
            for head, cell in zip(headers, cells):
                header_text = self.tag_text(head)
                cell_text = self.tag_text(cell)
                if header_text == cell_text:
                    # identical header and value carries no information;
                    # abandon the rest of this row
                    break
                result.append({header_text: cell_text})

    return result
96123
97124 @cached_property
98125 def possible_topic_names (self ):
@@ -106,7 +133,7 @@ def possible_topic_names(self):
106133 return list (set (result ))
107134
108135 @cached_property
109- def address (self ):
136+ def addresses (self ):
110137 classes = ["address" , "location" , "contact" ]
111138 words = [
112139 "آدرس" ,
@@ -122,14 +149,17 @@ def address(self):
122149 "تفاطع" ,
123150 ]
124151 #
152+ def _result (res : list ):
153+ return sorted (res )
154+
125155 def _texts_of (tags ):
126156 return list ({clean_text (t .text ) for t in tags if t .text })
127157
128158 ad_tags = self .all ("address" )
129159 if ad_tags :
130160 texts = _texts_of (ad_tags )
131161 if texts :
132- return texts [ 0 ]
162+ return _result ( texts )
133163
134164 def _f (class_name ):
135165 """Returns all `class_name` like tags in the footer"""
@@ -141,18 +171,23 @@ def _f(class_name):
141171
142172 if tags :
143173 texts = _texts_of (tags )
144- return texts [0 ] if texts else None
145-
174+ return _result (texts ) if texts else None
146175 else :
147176 # searching
148177 footer = self .all ("footer" )[- 1 ]
149178 if not footer :
150179 return None
151180 for w in words :
152181 search = footer .find_all (text = True )
153- texts = list ({clean_text (text ) for text in search if w in text })
182+ texts = list (
183+ {
184+ clean_text (text )
185+ for text in search
186+ if w in text and len (text ) >= 45
187+ }
188+ )
154189 if texts :
155- return texts [ 0 ]
190+ return _result ( texts )
156191
157192 @cached_property
158193 def question_answers (self ):
@@ -164,14 +199,14 @@ def question_answers(self):
def _bad_topic_names(self):
    """
    Return the flat list of words that disqualify a topic name.

    The Persian entries come first, followed by the English ones.
    """
    persian = ["فروشگاه", "خانه", "صفحه اصلی", "برگشت", "بازگشت"]
    english = ["home", "return", "back", "undo", "shop", "change"]
    # one flat list of str, language groups concatenated in order
    return [*persian, *english]
172207
173208 @cached_property
174- def application_json (self ):
209+ def application_json (self ) -> dict :
175210 all_json_tags = self .all ("script" , attrs = {"type" : "application/ld+json" })
176211 if not all_json_tags :
177212 return None
@@ -229,6 +264,27 @@ def contains(self, tag_name: str, attr: str, value: str):
229264 """
230265 return self .xpath (f'{ tag_name } [{ attr } *="{ value } "]' )
231266
def get_site_map_links(self, soup, contain: list = None):
    """
    Return the links of `soup`, optionally filtered by keywords.

    Links are taken from `soup.a_tag_hrefs`; when that is empty or has
    fewer than three entries, the texts of all `loc` tags (looked up via
    `soup.helper.all("loc")`) are used instead, if any exist.

    Parameters
    ----------
    soup :
        Object exposing `a_tag_hrefs` and `helper.all(...)`.
    contain : list, optional
        Keywords; a link is kept when any keyword occurs
        (case-insensitively) in element 1 or 2 of `pure_url(link)`.
        # NOTE(review): presumably those are the first path segments of
        # the URL — confirm against `pure_url`.

    Returns
    -------
    The collected links (filtered when `contain` is given). When no links
    could be collected the original falsy value is returned unchanged.
    """
    hrefs = soup.a_tag_hrefs
    if not hrefs or len(hrefs) < 3:
        locs = soup.helper.all("loc")
        if locs:
            hrefs = [self.tag_text(t) for t in locs]

    # Nothing to filter (or nothing to filter with): hand back as-is.
    # This also avoids iterating `None` when no links were found.
    if not contain or not hrefs:
        return hrefs

    # lower-case the keywords once instead of once per link
    words = [w.lower() for w in contain]

    def contain_cond(url: str):
        # split the URL once per link rather than once per keyword
        parts = pure_url(url)
        candidates = [p.lower() for p in parts[1:3]]
        return any(w in p for w in words for p in candidates)

    return [link for link in hrefs if contain_cond(link)]
287+
232288 def linked_files (self , extension : str ):
233289 """
234290 returns all `<a>` tags that their `href` contains `.extension`
@@ -243,14 +299,26 @@ def linked_files(self, extension: str):
243299 """
244300 return self .contains ("a" , "href" , f".{ extension } " )
245301
def from_structured_data(
    self, key: str, single: bool = False, unique: bool = False
):
    """
    Look up `key` in the page's structured data (JSON-LD only for now).

    Guide : https://developers.google.com/search/docs/advanced/structured-data/intro-structured-data

    Test a URL : https://developers.google.com/search/docs/advanced/structured-data/

    Parameters
    ----------
    key : str
        Key passed on to `from_json_schema`.
    single : bool
        When true, return only the first value, or `None` if there is
        none.
    unique : bool
        When true, drop repeated values while keeping first-seen order.

    Returns
    -------
    Whatever `from_json_schema(key)` yields (de-duplicated when `unique`),
    or its first element / `None` when `single` is set.
    """
    from_json_ld = self.from_json_schema(key)
    # TODO: providing other structured schemas like
    # RDFa and Microdata
    result = from_json_ld
    if unique and result:
        # `list(set(...))` would scramble the order (making `single`
        # return an arbitrary value) and fail on unhashable values such
        # as nested dicts/lists, so de-duplicate manually.
        seen_hashable = set()
        seen_unhashable = []
        deduped = []
        for item in result:
            try:
                if item in seen_hashable:
                    continue
                seen_hashable.add(item)
            except TypeError:
                # unhashable value: fall back to equality comparison
                if item in seen_unhashable:
                    continue
                seen_unhashable.append(item)
            deduped.append(item)
        result = deduped
    if single:
        return result[0] if result else None
    return result
254322
255323 def from_json_schema (self , key : str ):
256324 """
@@ -289,7 +357,7 @@ def _ok_topic_name(self, name: str):
289357 if not name or name == "" or len (name ) > 26 :
290358 # print("Null topic name or many charachters")
291359 return False
292- if name in self ._bad_topic_names :
360+ if name . split ()[ 0 ] in self ._bad_topic_names :
293361 return False
294362 site_name = self .site_name
295363 msg = f"| name : { name } , site name : { site_name } "
0 commit comments