diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 49f6617..cd3560b 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -148,23 +148,27 @@ def parse_pronunciations(self, word_contents): pronunciation_text = [] span_tag = self.soup.find_all('span', {'id': pronunciation_id})[0] list_tag = span_tag.parent + + list_elements = [] while list_tag.name != 'ul': list_tag = list_tag.find_next_sibling() - if list_tag.name == 'p': - pronunciation_text.append(list_tag.text) - break - if list_tag.name == 'div' and any(_ in pronunciation_div_classes for _ in list_tag['class']): - break - for super_tag in list_tag.find_all('sup'): - super_tag.clear() - for list_element in list_tag.find_all('li'): + if list_tag.name == 'p': break + list_elements += list_tag.find_all('li') + + for list_element in list_elements: + for super_tag in list_element.find_all('sup'): + super_tag.clear() + for audio_tag in list_element.find_all('div', {'class': 'mediaContainer'}): audio_links.append(audio_tag.find('source')['src']) audio_tag.extract() + for nested_list_element in list_element.find_all('ul'): nested_list_element.extract() + if list_element.text and not list_element.find('table', {'class': 'audiotable'}): pronunciation_text.append(list_element.text.strip()) + pronunciation_list.append((pronunciation_index, pronunciation_text, audio_links)) return pronunciation_list