diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 8c3c94b5ff..54c721eed2 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -576,7 +576,8 @@ class WikiCorpus(TextCorpus):
     """
     def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                  filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
-                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
+                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None,
+                 metadata=False):
         """Initialize the corpus.
 
         Unless a dictionary is provided, this scans the corpus once,
@@ -612,6 +613,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
             If set, each XML article element will be passed to this callable before being processed. Only articles
             where the callable returns an XML element are processed, returning None allows filtering out
             some articles based on customised rules.
+        metadata : bool, optional
+            Whether to write article titles to the serialized corpus.
 
         Warnings
         --------
@@ -621,7 +624,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.fname = fname
         self.filter_namespaces = filter_namespaces
         self.filter_articles = filter_articles
-        self.metadata = False
+        self.metadata = metadata
         if processes is None:
             processes = max(1, multiprocessing.cpu_count() - 1)
         self.processes = processes
@@ -631,9 +634,8 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
         self.token_min_len = token_min_len
         self.token_max_len = token_max_len
         self.lower = lower
-
         if dictionary is None:
-            self.dictionary = Dictionary(self.get_texts())
+            self.dictionary = Dictionary(self.get_texts(dictionary_mode=True))
         else:
             self.dictionary = dictionary
 
@@ -641,7 +643,7 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
     def input(self):
         return self.fname
 
-    def get_texts(self):
+    def get_texts(self, dictionary_mode=False):
         """Iterate over the dump, yielding a list of tokens for each article that passed
         the length and namespace filtering.
 
@@ -651,6 +653,12 @@ def get_texts(self):
         -----
         This iterates over the **texts**. If you want vectors, just use the standard corpus interface
         instead of this method:
+
+        Parameters
+        ----------
+        dictionary_mode : bool, optional
+            If True, yields lists of str.
+            If False, the yielded value depends on self.metadata (see 'Yields' below).
 
         Examples
         --------
@@ -696,7 +704,7 @@ def get_texts(self):
                         continue
                     articles += 1
                     positions += len(tokens)
-                    if self.metadata:
+                    if self.metadata and not dictionary_mode:
                         yield (tokens, (pageid, title))
                     else:
                         yield tokens
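
With this patch applied, metadata can be requested directly in the constructor, and the dictionary-building pass no longer receives (tokens, (pageid, title)) tuples, because the constructor now calls get_texts(dictionary_mode=True). A minimal usage sketch follows; the dump path and output filename are placeholders:

    from gensim.corpora.wikicorpus import WikiCorpus
    from gensim.corpora.mmcorpus import MmCorpus

    # metadata=True makes get_texts() yield (tokens, (pageid, title)) pairs,
    # while the Dictionary is still built from plain token lists via
    # get_texts(dictionary_mode=True) in the constructor.
    wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', metadata=True)

    # Serializing with metadata=True pickles the (pageid, title) pairs to a
    # side file ('wiki.mm.metadata.cpickle') next to the vector data.
    MmCorpus.serialize('wiki.mm', wiki, metadata=True)

Before this change, the same effect required setting wiki.metadata = True by hand after construction: passing metadata=True any earlier would have fed (tokens, metadata) tuples into Dictionary(self.get_texts()).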