text-regenerator/TextRegenerator.py at master · jddunn/text-regenerator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# Given an existing complete statement or question, generates variations of the sentences using synonym matching.
# Synonym matching code from nickloewen (https://github.com/nickloewen/thesaurus)
# Can either be run as a script, passing arguments in Terminal / Command Prompt, or be imported as a library.

import sys
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
import re

class TextRegenerator(object):
    """Generate variations of a sentence by swapping words for WordNet synonyms.

    Each token of the input is looked up in WordNet (via ``nltk``); the
    resulting per-token synonym lists are then combined column-wise into
    alternative phrases.  Words on the stop-word list are never replaced.
    """

    # Built-in stop words: these are always kept as-is in generated phrases.
    words_to_preserve_default = ["a", "the", "in", "of", "at", "does"]
    # Active stop-word list.  It is a class attribute, so it is shared by all
    # instances; addStopWords() replaces it.  It starts out as a copy of the
    # defaults so the class is usable before addStopWords() is ever called.
    # Capitalised variants ("The") are handled by comparing case-insensitively
    # in _isPreserved() rather than by storing title-cased copies here.
    words_to_preserve = list(words_to_preserve_default)
    punctuation = [".", ",", ":", ";", "?", "!"]

    def __init__(self):
        self.str_base = ""
        self.new_str = ""
        self.new_str_list = []

    def _isPreserved(self, word):
        """Return True if ``word`` matches a stop word, ignoring case."""
        lowered = word.lower()
        return any(lowered == kept.lower()
                   for kept in TextRegenerator.words_to_preserve)

    def createWordSynonyms(self, word):
        """Return ``word`` followed by its WordNet synonyms, without duplicates.

        Stop words are returned as a single-element list and are not looked
        up in WordNet at all.  The original word is always the first entry,
        so the result is never empty.
        """
        if self._isPreserved(word):
            return [word]

        synonyms = [word]
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                synonyms.append(lemma.name())
        return self.uniq(synonyms)

    def createPhraseSynonyms(self, _str_base):
        """Finds synonyms for every word in the input. Returns a list, containing a
        list of synonyms for every word in the input."""
        # Tokens are: runs of word characters, dollar amounts, or any other
        # run of non-whitespace (which is how punctuation becomes a token).
        tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
        tokens = tokenizer.tokenize(_str_base)
        return [self.createWordSynonyms(token) for token in tokens]

    def stripUnderscores(self, word):
        """Replace every underscore in ``word`` with a space."""
        return word.replace("_", " ")

    def tidyPunctuation(self, word):
        """Remove the whitespace character that precedes a punctuation mark.

        Only marks that are followed by whitespace or the end of the string
        are affected.  Covers the marks in ``punctuation`` plus the double
        quote.
        """
        return re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', word)

    def uniq(self, seq):
        """Return the items of ``seq`` in first-seen order, duplicates removed."""
        seen = set()
        unique = []
        for item in seq:
            if item not in seen:
                seen.add(item)
                unique.append(item)
        return unique

    def permuteAllSynonyms(self, phrase_synonyms):
        """Combine per-token synonym lists into whole phrases.

        ``phrase_synonyms`` is a list with one non-empty list of candidates
        per token.  Phrase number ``i`` uses the ``i``-th candidate of every
        token; a token with fewer candidates keeps repeating its last one.
        The number of phrases equals the length of the longest candidate
        list.  The input lists are not modified.
        """
        if not phrase_synonyms:
            return []

        # The token with the most candidates decides how many phrases we get.
        phrase_count = max(len(candidates) for candidates in phrase_synonyms)

        output = []
        for i in range(phrase_count):
            # Clamp the index so shorter lists stick to their final entry.
            words = [str(candidates[min(i, len(candidates) - 1)])
                     for candidates in phrase_synonyms]
            output.append(" ".join(words).strip())
        return output

    def generateStrVariations(self, _str_base):
        """Generates variations (through synonym matching) of an inputted string, ignoring
           list of stop words.

        Prints each variation with underscores replaced by spaces and
        punctuation tidied, and returns the list of variations as built by
        permuteAllSynonyms (i.e. without that tidying applied).
        """
        print('\n\tNow generating variations of: "' + _str_base + '"..')

        # Many of the variations produced by plain synonym matching will not
        # make sense; no filtering is attempted here.
        phrase_synonyms = self.createPhraseSynonyms(_str_base)
        output = self.permuteAllSynonyms(phrase_synonyms)
        for phrase in output:
            print("\t\t" + str(self.tidyPunctuation(self.stripUnderscores(phrase))))
        print("> > >\n")
        return output

    def addStopWords(self, l_param):
        """Replace the stop-word list with ``l_param`` plus the default stop words.

        ``l_param`` is either a single string of comma-separated words (it is
        lower-cased, and whitespace around each word is ignored) or an
        iterable of words, which is used as given.  ``None`` is treated as
        an empty list.  The result is stored on the class, so it affects
        every instance, and is printed.
        """
        if l_param is None:
            l_param = []
        try:
            lowered = l_param.lower()
        except AttributeError:
            # Not a string: assume it is already an iterable of words.
            words = list(l_param)
        else:
            words = [w.strip() for w in lowered.split(',') if w.strip()]

        preserved = list(words)
        preserved.extend(w for w in TextRegenerator.words_to_preserve_default
                         if w not in words)
        TextRegenerator.words_to_preserve = preserved
        print("\n< < <\n\t")
        print("Final list of stop words: " + str(TextRegenerator.words_to_preserve))


if __name__ == '__main__':
    # Optional second argument: extra stop words, given as one quoted string
    # of comma-separated words.  Fall back to an empty list when it is absent.
    try:
        l_param = str(sys.argv[2])
    except IndexError:
        l_param = []
    # Required first argument: the sentence to generate variations of.
    try:
        str_base = str(sys.argv[1]).lower()
    except IndexError as err:
        print("\n\n\tError initializing TextRegenerator: {0}".format(err))
        print("\n\tPossible invalid or no parameter supplied.\n\n\tRun the script again, with either the complete string "
        "you want to generate mass variations of as the only argument, or the second argument of a list of words to ignore (not " +
        "replace), separated by commas and spaces and enclosed in quotes.")
        print("\n\tExiting..")
        # Non-zero status so calling shells can tell the run failed.
        sys.exit(1)
    trgnr = TextRegenerator()
    trgnr.addStopWords(l_param)
    trgnr.generateStrVariations(str_base)
else:
    # Imported as a library: expose a module-level instance (trgnr) with only
    # the default stop words loaded.
    trgnr = TextRegenerator()
    l_param = []
    trgnr.addStopWords(l_param)