Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion deepwalk/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from . import walks as serialized_walks
from gensim.models import Word2Vec
from .skipgram import Skipgram
from . import phrases

from six import text_type as unicode
from six import iteritems
Expand Down Expand Up @@ -71,6 +72,11 @@ def process(args):
print("Walking...")
walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))

if args.ngram > 1:
print("Building n-gram with n="+str(args.ngram)+"...")
walks = phrases.build_ngram(walks, args.ngram)

print("Training...")
model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1, workers=args.workers)
else:
Expand All @@ -89,8 +95,13 @@ def process(args):
# use degree distribution for frequency in tree
vertex_counts = G.degree(nodes=G.iterkeys())

print("Training...")
walks_corpus = serialized_walks.WalksCorpus(walk_files)

if args.ngram > 1:
print("Building n-gram with n="+str(args.ngram)+"...")
walks_corpus = phrases.build_ngram(walks_corpus, args.ngram)

print("Training...")
model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
size=args.representation_size,
window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
Expand Down Expand Up @@ -150,6 +161,8 @@ def main():
parser.add_argument('--workers', default=1, type=int,
help='Number of parallel processes.')

parser.add_argument('--ngram', default=1, type=int,
help='N of n-grams, e.g.: set 2 for bigrams, 3 for trigrams, etc.')

args = parser.parse_args()
numeric_level = getattr(logging, args.log.upper(), None)
Expand Down
36 changes: 36 additions & 0 deletions deepwalk/phrases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import logging

from gensim.models.phrases import Phrases, Phraser

logger = logging.getLogger("deepwalk")

def build_ngram(walks, ngram, min_count=5, threshold=10.0,
                max_vocab_size=40000000, delimiter=b'_', scoring='default'):
    """
    Compose n-grams over random-walk sequences with gensim phrase detection.

    One detection pass is run for every order from 2 up to ``ngram``. Each
    pass fits a ``Phrases`` model on the current corpus, freezes it into a
    ``Phraser`` and replaces the corpus with ``phraser[corpus]``, so the next
    pass operates on the tokens already merged by the previous passes.

    Required Parameters
        - walks: iterable of list of str
            Input random walk sequences. Can be either a list of lists of
            tokens (in-memory) or a 'deepwalk.walks.WalksCorpus' object
            (out-of-core).
            NOTE(review): every pass hands ``walks`` to ``Phrases`` and the
            returned object wraps it again, so a one-shot generator is
            presumably unsuitable here -- confirm against the gensim docs.
        - ngram: int
            The n of n-gram, e.g. ngram=2 to compose bigrams.

    Optional Parameters
        min_count, threshold, max_vocab_size, delimiter and scoring are
        forwarded unchanged to gensim.models.phrases.Phrases on every pass;
        refer to that class for their meaning.

    Return
        - ``walks`` itself, untouched, when ngram < 2 (a warning is logged).
        - Otherwise the corpus obtained by indexing the last ``Phraser`` with
          the corpus produced by the previous pass.
    """
    if ngram < 2:
        logger.warning("ngram must >=2! Skip building ngram.")
        return walks

    for n in range(2, ngram + 1):
        # Lazy %-style arguments: the message is only formatted when the
        # INFO level is actually enabled.
        logger.info("Composing %s-grams...", n)
        ngram_phrases = Phrases(
            walks,
            min_count=min_count,
            threshold=threshold,
            max_vocab_size=max_vocab_size,
            delimiter=delimiter,
            scoring=scoring,
        )
        # Phraser is built from the fitted Phrases model and then used to
        # transform the corpus that feeds the next pass.
        ngram_phraser = Phraser(ngram_phrases)
        walks = ngram_phraser[walks]

    return walks