@@ -73,7 +73,7 @@ def _prep_records(trs, ths, sep=' / '):
7373
7474 for td_no , td in enumerate (tds ):
7575 if td .find ('td' ):
76- text_ = td .find ('a' ).contents + ["\t \t / " ]
76+ text_ = [ '' ] if td . find ( 'a' ) is None else td .find ('a' ).contents + ["\t \t / " ]
7777 else :
7878 text_ = [_parse_other_tags_in_td_contents (x ) for x in td .contents ]
7979 # _move_element_to_end(text_, char='\t\t')
@@ -315,6 +315,10 @@ def parse_date(str_date, as_date_type=False):
315315# == Extract information ===========================================================================
316316
317317
318+ def _clean_key (k_text ):
319+ return k_text .replace ("–" , "-" ).strip ("()" ).removesuffix (".shtml" ).removesuffix (".shtm" )
320+
321+
318322def _parse_dd_or_dt (dd_or_dt ):
319323 """
320324 Extracts text and href attributes from dt or dd elements.
@@ -339,50 +343,37 @@ def _parse_dd_or_dt(dd_or_dt):
339343 # text = f'{text[1].upper()}{text[2:-1]}'
340344 href = a_href .find ('a' ).get ('href' )
341345
342- return text . replace ( "–" , "-" ), href
346+ return _clean_key ( text ), href
343347
344348
345349def _get_site_map_h3_dl_dt_dds (h3_dl_dt , next_dd = None ):
346350 if next_dd is None :
347351 next_dd = h3_dl_dt .find_next ('dd' )
348352
349- prev_dt = next_dd .find_previous (name = 'dt' )
353+ prev_dt = next_dd .find_previous ('dt' )
350354
351355 h3_dl_dt_dds = {}
352356 while prev_dt == h3_dl_dt :
353357 next_dd_sub_dl_ = next_dd .find ('dl' )
354358
355- if next_dd_sub_dl_ is None :
356- next_dd_contents = [x for x in next_dd .contents if x != '\n ' ]
357-
358- if len (next_dd_contents ) == 1 :
359- next_dd_content = next_dd_contents [0 ]
360- text = next_dd_content .get_text (strip = True )
361- href = next_dd_content .get (key = 'href' )
362-
363- else : # len(next_dd_contents) == 2:
364- a_href , text = next_dd_contents
365- if not isinstance (text , str ):
366- text , a_href = next_dd_contents
367-
368- href = a_href .find (name = 'a' ).get (key = 'href' )
369-
370- h3_dl_dt_dds .update (
371- {text .replace ("–" , "-" ): urllib .parse .urljoin (home_page_url (), href )})
372-
373- else :
374- sub_dts = next_dd_sub_dl_ .find_all (name = 'dt' )
359+ if next_dd_sub_dl_ :
360+ sub_dts = next_dd_sub_dl_ .find_all ('dt' )
375361
376362 for sub_dt in sub_dts :
377363 sub_dt_text , _ = _parse_dd_or_dt (sub_dt )
378- sub_dt_dds = sub_dt .find_next_siblings (name = 'dd' )
364+ sub_dt_dds = sub_dt .find_next_siblings ('dd' )
379365 sub_dt_dds_dict = _get_site_map_sub_dl (h3_dl_dts = sub_dt_dds )
380366
381- h3_dl_dt_dds .update ({sub_dt_text .replace ("–" , "-" ): sub_dt_dds_dict })
367+ h3_dl_dt_dds .update ({_clean_key (sub_dt_text ): sub_dt_dds_dict })
368+
369+ else :
370+ a = next_dd .find ('a' )
371+ text , href = _clean_key (a .get_text (strip = True )), a .get (key = 'href' )
372+ h3_dl_dt_dds .update ({text : urllib .parse .urljoin (home_page_url (), href )})
382373
383374 try :
384- next_dd = next_dd .find_next_sibling (name = 'dd' )
385- prev_dt = next_dd .find_previous_sibling (name = 'dt' )
375+ next_dd = next_dd .find_next_sibling ('dd' )
376+ prev_dt = next_dd .find_previous_sibling ('dt' )
386377 except AttributeError :
387378 break
388379
@@ -397,19 +388,20 @@ def _get_site_map_sub_dl(h3_dl_dts):
397388 h3_dl_dt_dd_dict = {}
398389
399390 for h3_dl_dt in h3_dl_dts :
400- dt_text , dt_href = _parse_dd_or_dt (dd_or_dt = h3_dl_dt )
391+ dt_text_ , dt_href = _parse_dd_or_dt (dd_or_dt = h3_dl_dt )
392+ dt_text = _clean_key (dt_text_ )
401393
402394 if dt_href :
403395 h3_dl_dt_dd_dict .update ({dt_text : urllib .parse .urljoin (home_page_url (), dt_href )})
404396
405397 else :
406398 next_dd = h3_dl_dt .find_next ('dd' )
407- next_dd_sub_dl = next_dd .find (name = 'dd ' )
399+ next_dd_sub_dl = next_dd .find ('dl ' )
408400
409401 if next_dd_sub_dl :
410402 # next_dd_sub_dl_dts = next_dd_sub_dl.find_all(name='dt')
411403 next_dd_sub_dl_dts = [
412- dt for dt in next_dd .find_all (name = 'dt' ) if dt .has_attr ('class' )]
404+ dt for dt in next_dd .find_all ('dt' ) if dt .has_attr ('class' )]
413405 h3_dl_dt_dd_dict .update ({dt_text : _get_site_map_sub_dl (next_dd_sub_dl_dts )})
414406
415407 else :
@@ -427,11 +419,11 @@ def _get_site_map(source, parser='html.parser'):
427419 soup = bs4 .BeautifulSoup (markup = source .content , features = parser )
428420 site_map = {}
429421
430- h3s = soup .find_all (name = 'h3' , attrs = {"class" : "site" })
422+ h3s = soup .find_all ('h3' , attrs = {"class" : "site" })
431423
432424 for h3 in h3s :
433425 h3_title = h3 .get_text (strip = True )
434- h3_dl_dts = h3 .find_next (name = 'dl' ).find_all (name = 'dt' ) # h3 > dl > dt
426+ h3_dl_dts = h3 .find_next ('dl' ).find_all ('dt' ) # h3 > dl > dt
435427
436428 if len (h3_dl_dts ) == 1 :
437429 dd_dict = {} # h3 > dl > dt > dd
@@ -442,12 +434,12 @@ def _get_site_map(source, parser='html.parser'):
442434 if h3_dl_dt_text == '' :
443435 for dd in h3_dl_dt .find_next_siblings ('dd' ):
444436 text , href = _parse_dd_or_dt (dd )
445- dd_dict .update ({text : urllib .parse .urljoin (home_page_url (), href )})
437+ dd_dict .update ({_clean_key ( text ) : urllib .parse .urljoin (home_page_url (), href )})
446438
447439 else :
448440 dd_dict = _get_site_map_sub_dl (h3_dl_dts = h3_dl_dts )
449441
450- site_map .update ({h3_title : dd_dict })
442+ site_map .update ({_clean_key ( h3_title ) : dd_dict })
451443
452444 # noinspection SpellCheckingInspection
453445 site_map = update_dict_keys (
@@ -471,23 +463,23 @@ def get_site_map(update=False, confirmation_required=True, verbose=False, raise_
471463 :param raise_error: Whether to raise the provided exception;
472464 if ``raise_error=False``, the error will be suppressed; defaults to ``True``.
473465 :type raise_error: bool
474- :return: An ordered dictionary containing the data of site map.
475- :rtype: collections.OrderedDict | None
466+ :return: A dictionary containing the data of site map.
467+ :rtype: dict | None
476468
477469 **Examples**::
478470
479471 >>> from pyrcs.parser import get_site_map
480472 >>> site_map = get_site_map()
481473 >>> type(site_map)
482- collections.OrderedDict
474+ dict
483475 >>> list(site_map.keys())
484476 ['Home',
485477 'Line data',
486478 'Other assets',
487479 '"Legal/financial" lists',
488480 'Miscellaneous']
489481 >>> site_map['Home']
490- {'index.shtml ': 'http://www.railwaycodes.org.uk/index.shtml'}
482+ {'index': 'http://www.railwaycodes.org.uk/index.shtml'}
491483 """
492484
493485 path_to_file = cd_data ("site-map.json" , mkdir = True )
@@ -691,9 +683,12 @@ def get_introduction(url, delimiter='\n', update=False, verbose=False, raise_err
691683
692684 try :
693685 source = requests .get (url = url , headers = fake_requests_headers ())
694- except requests .exceptions .ConnectionError :
695- print_inst_conn_err (update = update , verbose = True if update else verbose )
696- return None
686+ except requests .exceptions .ConnectionError as e :
687+ if raise_error :
688+ raise e # Raise the original connection error
689+ else :
690+ print_inst_conn_err (update = update , verbose = True if update else verbose , e = e )
691+ return None
697692
698693 try :
699694 introduction = _parse_introduction (source = source , delimiter = delimiter )
0 commit comments