diff --git a/readme.md b/readme.md index 794c3f0..c2c418b 100644 --- a/readme.md +++ b/readme.md @@ -27,7 +27,7 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o #### Installation -##### Using pip +##### Using pip * run `pip install wiktionaryparser` ##### From Source @@ -55,6 +55,13 @@ A python project which downloads words from English Wiktionary ([en.wiktionary.o >>> parser.include_relation('alternative forms') ``` +```python +>>> word, categories = parser.fetch('test', return_categories=True) +>>> words = parser.fetch_category('English phrasebook') +>>> words, subcategories = parser.fetch_category('English phrasebook', + return_subcategories=True) +``` + #### Requirements - requests==2.20.0 diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py index 6497b23..10f5bab 100644 --- a/wiktionaryparser/core.py +++ b/wiktionaryparser/core.py @@ -107,7 +107,7 @@ def get_id_list(self, contents, content_type): id_list.append((content_index, content_id, text_to_check)) return id_list - def get_word_data(self, language): + def get_word_data(self, language, return_categories): contents = self.soup.find_all('span', {'class': 'toctext'}) word_contents = [] start_index = None @@ -137,7 +137,18 @@ def get_word_data(self, language): 'pronunciations': self.parse_pronunciations(word_contents), } json_obj_list = self.map_to_object(word_data) - return json_obj_list + if return_categories: + categories = self.parse_categories() + return json_obj_list, categories[1:] + else: + return json_obj_list + + def parse_categories(self): + categories_list = [] + catlinks = self.soup.find_all('div', {'class': 'catlinks'}) + if len(catlinks) == 1: + categories_list = [cat.text for cat in catlinks[0].find_all('a')] + return categories_list def parse_pronunciations(self, word_contents): pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation') @@ -275,10 +286,46 @@ def map_to_object(self, word_data): json_obj_list.append(data_obj.to_json()) return json_obj_list - def fetch(self, word, language=None, old_id=None): + def parse_next_page_links(self, category): + link_tags = self.soup.find('div', {'id': 'mw-pages'}).find_all('a', {'title': category}) + return [link['href'] for link in link_tags if link.text == 'next page'] + + def parse_category_words(self): + words_content = self.soup.find('div', {'id': 'mw-pages'}).find('div', {'class': 'mw-content-ltr'}) + words = [word.text for word in words_content.find_all('a')] + return words + + def get_category_data(self, category, return_subcategories=False): + words = [] + next_page_links = self.parse_next_page_links(category) + while len(next_page_links) > 0: + words += self.parse_category_words() + response = self.session.get('https://en.wiktionary.org/' + next_page_links[0]) + self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') + self.clean_html() + next_page_links = self.parse_next_page_links(category) + words += self.parse_category_words() + + if return_subcategories: + subcategories = [] + category_groups = self.soup.find('div', {'id': 'mw-subcategories'}).find_all('div', {'class': 'mw-category-group'}) + for category_group in category_groups: + subcategories += [cat.text for cat in category_group.find_all('a')] + return words, subcategories + else: + return words + + def fetch(self, word, language=None, old_id=None, return_categories=False): language = self.language if not language else language response = self.session.get(self.url.format(word), params={'oldid': old_id}) self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') self.current_word = word self.clean_html() - return self.get_word_data(language.lower()) + return self.get_word_data(language.lower(), return_categories) + + def fetch_category(self, category, return_subcategories=False): + category = "Category:" + category + response = self.session.get(self.url.format(category)) + self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser') + self.clean_html() + return self.get_category_data(category, return_subcategories=return_subcategories) diff --git a/wiktionaryparser/utils.py b/wiktionaryparser/utils.py index 4920620..858322b 100644 --- a/wiktionaryparser/utils.py +++ b/wiktionaryparser/utils.py @@ -76,4 +76,4 @@ def to_json(self): return { 'relationshipType': self.relationship_type, 'words': self.words - } \ No newline at end of file + }