diff --git a/deepwalk/__main__.py b/deepwalk/__main__.py index 1070e07..f1c6709 100644 --- a/deepwalk/__main__.py +++ b/deepwalk/__main__.py @@ -14,6 +14,7 @@ from . import walks as serialized_walks from gensim.models import Word2Vec from .skipgram import Skipgram +from . import phrases from six import text_type as unicode from six import iteritems @@ -71,6 +72,11 @@ def process(args): print("Walking...") walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed)) + + if args.ngram > 1: + print("Building n-gram with n="+str(args.ngram)+"...") + walks = phrases.build_ngram(walks, args.ngram) + print("Training...") model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers) else: @@ -89,8 +95,13 @@ def process(args): # use degree distribution for frequency in tree vertex_counts = G.degree(nodes=G.iterkeys()) - print("Training...") walks_corpus = serialized_walks.WalksCorpus(walk_files) + + if args.ngram > 1: + print("Building n-gram with n="+str(args.ngram)+"...") + walks_corpus = phrases.build_ngram(walks_corpus, args.ngram) + + print("Training...") model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts, size=args.representation_size, window=args.window_size, min_count=0, trim_rule=None, workers=args.workers) @@ -150,6 +161,8 @@ def main(): parser.add_argument('--workers', default=1, type=int, help='Number of parallel processes.') + parser.add_argument('--ngram', default=1, type=int, + help='N of n-grams, e.g.: set 2 for bigrams, 3 for trigrams, etc.') args = parser.parse_args() numeric_level = getattr(logging, args.log.upper(), None) diff --git a/deepwalk/phrases.py b/deepwalk/phrases.py new file mode 100644 index 0000000..48fbe8f --- /dev/null +++ b/deepwalk/phrases.py @@ -0,0 +1,36 @@ +import logging + +from gensim.models.phrases import Phrases, Phraser + +logger = logging.getLogger("deepwalk") + +def 
def build_ngram(walks, ngram, min_count=5, threshold=10.0,
                max_vocab_size=40000000, delimiter=b'_', scoring='default'):
    """
    Compose n-grams on the fly given tunable parameters; works for both
    in-memory and out-of-core computations.

    Required Parameters
    - walks: iterable of list of str (a list of token lists, or a
      deepwalk.walks.WalksCorpus object for out-of-core streaming).
      Input random-walk sequences.
    - ngram: int
      The n of the n-grams, e.g.: ngram=2 to compose bigrams.

    Optional Parameters
    Reference gensim.models.phrases.Phrases for min_count, threshold,
    max_vocab_size, delimiter and scoring.

    Return
    - iterable of list of str: the walks with detected collocations merged
      into single delimiter-joined tokens, or the input unchanged (same
      object) when ngram < 2.
    """
    log = logging.getLogger("deepwalk")

    # Guard clause: nothing to compose, hand the corpus back untouched.
    if ngram < 2:
        log.warning("ngram must >=2! Skip building ngram.")
        return walks

    # NOTE(review): each Phrases pass merges statistically significant
    # adjacent token pairs, including tokens merged by earlier passes, so
    # repeated passes grow phrase length but pass n does not strictly yield
    # exact n-grams -- confirm this matches the intended semantics.
    for n in range(2, ngram + 1):
        # Lazy %-args: the message is only formatted if INFO is enabled.
        log.info("Composing %d-grams...", n)
        phrase_model = Phrases(walks, min_count=min_count, threshold=threshold,
                               max_vocab_size=max_vocab_size,
                               delimiter=delimiter, scoring=scoring)
        # Phraser is a slim, frozen export of the detected phrases: much
        # cheaper to apply and hold in memory than the full Phrases model.
        walks = Phraser(phrase_model)[walks]

    return walks