diff --git a/Exporter.py b/Exporter.py index c1872855..84304d41 100644 --- a/Exporter.py +++ b/Exporter.py @@ -1,79 +1,96 @@ # -*- coding: utf-8 -*- -import sys,getopt,datetime,codecs +import sys +import getopt +import datetime +import codecs if sys.version_info[0] < 3: import got else: import got3 as got + def main(argv): - if len(argv) == 0: - print('You must pass some parameters. Use \"-h\" to help.') - return + if len(argv) == 0: + print('You must pass some parameters. Use \"-h\" to help.') + return + + if len(argv) == 1 and argv[0] == '-h': + f = open('exporter_help_text.txt', 'r') + print f.read() + f.close() + + return + + try: + opts, args = getopt.getopt(argv, "", ("username=", "near=", "within=", "since=", + "until=", "querysearch=", "toptweets", "maxtweets=", "output=")) + + tweetCriteria = got.manager.TweetCriteria() + outputFileName = "output_got.csv" - if len(argv) == 1 and argv[0] == '-h': - f = open('exporter_help_text.txt', 'r') - print f.read() - f.close() + for opt, arg in opts: + if opt == '--username': + tweetCriteria.username = arg - return + elif opt == '--since': + tweetCriteria.since = arg - try: - opts, args = getopt.getopt(argv, "", ("username=", "near=", "within=", "since=", "until=", "querysearch=", "toptweets", "maxtweets=", "output=")) + elif opt == '--until': + tweetCriteria.until = arg - tweetCriteria = got.manager.TweetCriteria() - outputFileName = "output_got.csv" + elif opt == '--querysearch': + tweetCriteria.querySearch = arg - for opt,arg in opts: - if opt == '--username': - tweetCriteria.username = arg + elif opt == '--toptweets': + tweetCriteria.topTweets = True - elif opt == '--since': - tweetCriteria.since = arg + elif opt == '--maxtweets': + tweetCriteria.maxTweets = int(arg) - elif opt == '--until': - tweetCriteria.until = arg + elif opt == '--near': + tweetCriteria.near = '"' + arg + '"' - elif opt == '--querysearch': - tweetCriteria.querySearch = arg + elif opt == '--within': + tweetCriteria.within = '"' + arg + '"' - elif opt == '--toptweets': - tweetCriteria.topTweets = True + elif opt == '--within': + tweetCriteria.within = '"' + arg + '"' - elif opt == '--maxtweets': - tweetCriteria.maxTweets = int(arg) - - elif opt == '--near': - tweetCriteria.near = '"' + arg + '"' - - elif opt == '--within': - tweetCriteria.within = '"' + arg + '"' + elif opt == '--output': + outputFileName = arg - elif opt == '--within': - tweetCriteria.within = '"' + arg + '"' + outputFile = codecs.open(outputFileName, "w+", "utf-8") - elif opt == '--output': - outputFileName = arg - - outputFile = codecs.open(outputFileName, "w+", "utf-8") + outputFile.write('username;date;retweets;favorites;text;geo;mentions;hashtags;id;permalink') - outputFile.write('username;date;retweets;favorites;text;geo;mentions;hashtags;id;permalink') + print('Searching...\n') - print('Searching...\n') + def receiveBuffer(tweets): + for t in tweets: + outputFile.write( + ('\n%s;%s;%d;%d;"%s";%s;%s;%s;"%s";%s' % + (t.username, + t.date.strftime("%Y-%m-%d %H:%M"), + t.retweets, + t.favorites, + t.text, + t.geo, + t.mentions, + t.hashtags, + t.id, + t.permalink))) + outputFile.flush() + print('More %d saved on file...\n' % len(tweets)) - def receiveBuffer(tweets): - for t in tweets: - outputFile.write(('\n%s;%s;%d;%d;"%s";%s;%s;%s;"%s";%s' % (t.username, t.date.strftime("%Y-%m-%d %H:%M"), t.retweets, t.favorites, t.text, t.geo, t.mentions, t.hashtags, t.id, t.permalink))) - outputFile.flush(); - print('More %d saved on file...\n' % len(tweets)) + got.manager.TweetManager.getTweets(tweetCriteria, receiveBuffer) - got.manager.TweetManager.getTweets(tweetCriteria, receiveBuffer) + except arg: + print('Arguments parser error, try -h' + arg) + finally: + outputFile.close() + print('Done. Output file generated "%s".' % outputFileName) - except arg: - print('Arguments parser error, try -h' + arg) - finally: - outputFile.close() - print('Done. Output file generated "%s".' % outputFileName) if __name__ == '__main__': - main(sys.argv[1:]) + main(sys.argv[1:]) diff --git a/Main.py b/Main.py index 75496a5f..a9fb5b09 100644 --- a/Main.py +++ b/Main.py @@ -4,33 +4,39 @@ else: import got3 as got + def main(): - def printTweet(descr, t): - print(descr) - print("Username: %s" % t.username) - print("Retweets: %d" % t.retweets) - print("Text: %s" % t.text) - print("Mentions: %s" % t.mentions) - print("Hashtags: %s\n" % t.hashtags) + def printTweet(descr, t): + print(descr) + print("Username: %s" % t.username) + print("Retweets: %d" % t.retweets) + print("Text: %s" % t.text) + print("Mentions: %s" % t.mentions) + print("Hashtags: %s\n" % t.hashtags) + + # Example 1 - Get tweets by username + tweetCriteria = got.manager.TweetCriteria().setUsername('barackobama').setMaxTweets(1) + tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0] - # Example 1 - Get tweets by username - tweetCriteria = got.manager.TweetCriteria().setUsername('barackobama').setMaxTweets(1) - tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0] + printTweet("### Example 1 - Get tweets by username [barackobama]", tweet) - printTweet("### Example 1 - Get tweets by username [barackobama]", tweet) + # Example 2 - Get tweets by query search + tweetCriteria = got.manager.TweetCriteria().setQuerySearch( + 'europe refugees').setSince("2015-05-01").setUntil("2015-09-30").setMaxTweets(1) + tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0] - # Example 2 - Get tweets by query search - tweetCriteria = got.manager.TweetCriteria().setQuerySearch('europe refugees').setSince("2015-05-01").setUntil("2015-09-30").setMaxTweets(1) - tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0] + printTweet("### Example 2 - Get tweets by query search [europe refugees]", tweet) - printTweet("### Example 2 - Get tweets by query search [europe refugees]", tweet) + # Example 3 - Get tweets by username and bound dates + tweetCriteria = got.manager.TweetCriteria().setUsername( + "barackobama").setSince("2015-09-10").setUntil("2015-09-12").setMaxTweets(1) + tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0] - # Example 3 - Get tweets by username and bound dates - tweetCriteria = got.manager.TweetCriteria().setUsername("barackobama").setSince("2015-09-10").setUntil("2015-09-12").setMaxTweets(1) - tweet = got.manager.TweetManager.getTweets(tweetCriteria)[0] + printTweet( + "### Example 3 - Get tweets by username and bound dates [barackobama, '2015-09-10', '2015-09-12']", + tweet) - printTweet("### Example 3 - Get tweets by username and bound dates [barackobama, '2015-09-10', '2015-09-12']", tweet) if __name__ == '__main__': - main() + main() diff --git a/got/__init__.py b/got/__init__.py index 34022945..d600f949 100644 --- a/got/__init__.py +++ b/got/__init__.py @@ -1,2 +1,2 @@ import models -import manager \ No newline at end of file +import manager diff --git a/got/manager/TweetCriteria.py b/got/manager/TweetCriteria.py index 76649e28..c2752c95 100644 --- a/got/manager/TweetCriteria.py +++ b/got/manager/TweetCriteria.py @@ -1,37 +1,37 @@ class TweetCriteria: - - def __init__(self): - self.maxTweets = 0 - self.within = "15mi" - - def setUsername(self, username): - self.username = username - return self - - def setSince(self, since): - self.since = since - return self - - def setUntil(self, until): - self.until = until - return self - - def setQuerySearch(self, querySearch): - self.querySearch = querySearch - return self - - def setMaxTweets(self, maxTweets): - self.maxTweets = maxTweets - return self - - def setTopTweets(self, topTweets): - self.topTweets = topTweets - return self - - def setNear(self, near): - self.near = near - return self - - def setWithin(self, within): - self.within = within - return self + + def __init__(self): + self.maxTweets = 0 + self.within = "15mi" + + def setUsername(self, username): + self.username = username + return self + + def setSince(self, since): + self.since = since + return self + + def setUntil(self, until): + self.until = until + return self + + def setQuerySearch(self, querySearch): + self.querySearch = querySearch + return self + + def setMaxTweets(self, maxTweets): + self.maxTweets = maxTweets + return self + + def setTopTweets(self, topTweets): + self.topTweets = topTweets + return self + + def setNear(self, near): + self.near = near + return self + + def setWithin(self, within): + self.within = within + return self diff --git a/got/manager/TweetManager.py b/got/manager/TweetManager.py index 2eb4210f..1be16919 100644 --- a/got/manager/TweetManager.py +++ b/got/manager/TweetManager.py @@ -1,135 +1,144 @@ -import urllib,urllib2,json,re,datetime,sys,cookielib +import urllib +import urllib2 +import json +import re +import datetime +import sys +import cookielib from .. import models from pyquery import PyQuery + class TweetManager: - - def __init__(self): - pass - - @staticmethod - def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): - refreshCursor = '' - - results = [] - resultsAux = [] - cookieJar = cookielib.CookieJar() - - if hasattr(tweetCriteria, 'username') and (tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and (tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")): - tweetCriteria.username = tweetCriteria.username[1:-1] - - active = True - - while active: - json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) - if len(json['items_html'].strip()) == 0: - break - - refreshCursor = json['min_position'] - tweets = PyQuery(json['items_html'])('div.js-stream-tweet') - - if len(tweets) == 0: - break - - for tweetHTML in tweets: - tweetPQ = PyQuery(tweetHTML) - tweet = models.Tweet() - - usernameTweet = tweetPQ("span:first.username.u-dir b").text(); - txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')); - retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")); - favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")); - dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")); - id = tweetPQ.attr("data-tweet-id"); - permalink = tweetPQ.attr("data-permalink-path"); - - geo = '' - geoSpan = tweetPQ('span.Tweet-geo') - if len(geoSpan) > 0: - geo = geoSpan.attr('title') - - tweet.id = id - tweet.permalink = 'https://twitter.com' + permalink - tweet.username = usernameTweet - tweet.text = txt - tweet.date = datetime.datetime.fromtimestamp(dateSec) - tweet.retweets = retweets - tweet.favorites = favorites - tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) - tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) - tweet.geo = geo - - results.append(tweet) - resultsAux.append(tweet) - - if receiveBuffer and len(resultsAux) >= bufferLength: - receiveBuffer(resultsAux) - resultsAux = [] - - if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets: - active = False - break - - - if receiveBuffer and len(resultsAux) > 0: - receiveBuffer(resultsAux) - - return results - - @staticmethod - def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy): - url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s" - - urlGetData = '' - - if hasattr(tweetCriteria, 'username'): - urlGetData += ' from:' + tweetCriteria.username - - if hasattr(tweetCriteria, 'querySearch'): - urlGetData += ' ' + tweetCriteria.querySearch - - if hasattr(tweetCriteria, 'near'): - urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within - - if hasattr(tweetCriteria, 'since'): - urlGetData += ' since:' + tweetCriteria.since - - if hasattr(tweetCriteria, 'until'): - urlGetData += ' until:' + tweetCriteria.until - - - if hasattr(tweetCriteria, 'topTweets'): - if tweetCriteria.topTweets: - url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s" - - - - url = url % (urllib.quote(urlGetData), refreshCursor) - - headers = [ - ('Host', "twitter.com"), - ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"), - ('Accept', "application/json, text/javascript, */*; q=0.01"), - ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"), - ('X-Requested-With', "XMLHttpRequest"), - ('Referer', url), - ('Connection', "keep-alive") - ] - - if proxy: - opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy, 'https': proxy}), urllib2.HTTPCookieProcessor(cookieJar)) - else: - opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) - opener.addheaders = headers - - try: - response = opener.open(url) - jsonResponse = response.read() - except: - print "Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd" % urllib.quote(urlGetData) - sys.exit() - return - - dataJson = json.loads(jsonResponse) - - return dataJson + + def __init__(self): + pass + + @staticmethod + def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): + refreshCursor = '' + + results = [] + resultsAux = [] + cookieJar = cookielib.CookieJar() + + if hasattr( + tweetCriteria, 'username') and ( + tweetCriteria.username.startswith("\'") or tweetCriteria.username.startswith("\"")) and ( + tweetCriteria.username.endswith("\'") or tweetCriteria.username.endswith("\"")): + tweetCriteria.username = tweetCriteria.username[1:-1] + + active = True + + while active: + json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) + if len(json['items_html'].strip()) == 0: + break + + refreshCursor = json['min_position'] + tweets = PyQuery(json['items_html'])('div.js-stream-tweet') + + if len(tweets) == 0: + break + + for tweetHTML in tweets: + tweetPQ = PyQuery(tweetHTML) + tweet = models.Tweet() + + usernameTweet = tweetPQ("span:first.username.u-dir b").text(); + txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')) + retweets = int(tweetPQ( + "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) + favorites = int(tweetPQ( + "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) + dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")) + id = tweetPQ.attr("data-tweet-id") + permalink = tweetPQ.attr("data-permalink-path") + + geo = '' + geoSpan = tweetPQ('span.Tweet-geo') + if len(geoSpan) > 0: + geo = geoSpan.attr('title') + + tweet.id = id + tweet.permalink = 'https://twitter.com' + permalink + tweet.username = usernameTweet + tweet.text = txt + tweet.date = datetime.datetime.fromtimestamp(dateSec) + tweet.retweets = retweets + tweet.favorites = favorites + tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) + tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) + tweet.geo = geo + + results.append(tweet) + resultsAux.append(tweet) + + if receiveBuffer and len(resultsAux) >= bufferLength: + receiveBuffer(resultsAux) + resultsAux = [] + + if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets: + active = False + break + + if receiveBuffer and len(resultsAux) > 0: + receiveBuffer(resultsAux) + + return results + + @staticmethod + def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy): + url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&max_position=%s" + + urlGetData = '' + + if hasattr(tweetCriteria, 'username'): + urlGetData += ' from:' + tweetCriteria.username + + if hasattr(tweetCriteria, 'querySearch'): + urlGetData += ' ' + tweetCriteria.querySearch + + if hasattr(tweetCriteria, 'near'): + urlGetData += "&near:" + tweetCriteria.near + " within:" + tweetCriteria.within + + if hasattr(tweetCriteria, 'since'): + urlGetData += ' since:' + tweetCriteria.since + + if hasattr(tweetCriteria, 'until'): + urlGetData += ' until:' + tweetCriteria.until + + if hasattr(tweetCriteria, 'topTweets'): + if tweetCriteria.topTweets: + url = "https://twitter.com/i/search/timeline?q=%s&src=typd&max_position=%s" + + url = url % (urllib.quote(urlGetData), refreshCursor) + + headers = [ + ('Host', "twitter.com"), + ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"), + ('Accept', "application/json, text/javascript, */*; q=0.01"), + ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"), + ('X-Requested-With', "XMLHttpRequest"), + ('Referer', url), + ('Connection', "keep-alive") + ] + + if proxy: + opener = urllib2.build_opener(urllib2.ProxyHandler( + {'http': proxy, 'https': proxy}), urllib2.HTTPCookieProcessor(cookieJar)) + else: + opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) + opener.addheaders = headers + + try: + response = opener.open(url) + jsonResponse = response.read() + except BaseException: + print "Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd" % urllib.quote(urlGetData) + sys.exit() + return + + dataJson = json.loads(jsonResponse) + + return dataJson diff --git a/got/manager/__init__.py b/got/manager/__init__.py index 2ebdffda..29f0895c 100644 --- a/got/manager/__init__.py +++ b/got/manager/__init__.py @@ -1,2 +1,2 @@ from TweetCriteria import TweetCriteria -from TweetManager import TweetManager \ No newline at end of file +from TweetManager import TweetManager diff --git a/got/models/Tweet.py b/got/models/Tweet.py index c7904e9b..f1c096de 100644 --- a/got/models/Tweet.py +++ b/got/models/Tweet.py @@ -1,4 +1,4 @@ class Tweet: - - def __init__(self): - pass \ No newline at end of file + + def __init__(self): + pass diff --git a/got/models/__init__.py b/got/models/__init__.py index 7791d535..521dc00b 100644 --- a/got/models/__init__.py +++ b/got/models/__init__.py @@ -1 +1 @@ -from Tweet import Tweet \ No newline at end of file +from Tweet import Tweet diff --git a/got3/__init__.py b/got3/__init__.py index 0913bac0..655fff4d 100644 --- a/got3/__init__.py +++ b/got3/__init__.py @@ -1,2 +1,2 @@ from . import models -from . import manager \ No newline at end of file +from . import manager diff --git a/got3/manager/TweetCriteria.py b/got3/manager/TweetCriteria.py index 695462b6..55e66970 100644 --- a/got3/manager/TweetCriteria.py +++ b/got3/manager/TweetCriteria.py @@ -1,32 +1,32 @@ class TweetCriteria: - def __init__(self): - self.maxTweets = 0 + def __init__(self): + self.maxTweets = 0 - def setUsername(self, username): - self.username = username - return self + def setUsername(self, username): + self.username = username + return self - def setSince(self, since): - self.since = since - return self + def setSince(self, since): + self.since = since + return self - def setUntil(self, until): - self.until = until - return self + def setUntil(self, until): + self.until = until + return self - def setQuerySearch(self, querySearch): - self.querySearch = querySearch - return self + def setQuerySearch(self, querySearch): + self.querySearch = querySearch + return self - def setMaxTweets(self, maxTweets): - self.maxTweets = maxTweets - return self + def setMaxTweets(self, maxTweets): + self.maxTweets = maxTweets + return self - def setLang(self, Lang): - self.lang = Lang - return self + def setLang(self, Lang): + self.lang = Lang + return self - def setTopTweets(self, topTweets): - self.topTweets = topTweets - return self \ No newline at end of file + def setTopTweets(self, topTweets): + self.topTweets = topTweets + return self diff --git a/got3/manager/TweetManager.py b/got3/manager/TweetManager.py index 5c627749..c0d9cab8 100644 --- a/got3/manager/TweetManager.py +++ b/got3/manager/TweetManager.py @@ -1,138 +1,153 @@ -import urllib.request, urllib.parse, urllib.error,urllib.request,urllib.error,urllib.parse,json,re,datetime,sys,http.cookiejar +import urllib.request +import urllib.parse +import urllib.error +import urllib.request +import urllib.error +import urllib.parse +import json +import re +import datetime +import sys +import http.cookiejar from .. import models from pyquery import PyQuery + class TweetManager: - - def __init__(self): - pass - - @staticmethod - def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): - refreshCursor = '' - - results = [] - resultsAux = [] - cookieJar = http.cookiejar.CookieJar() - - active = True - - while active: - json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) - if len(json['items_html'].strip()) == 0: - break - - refreshCursor = json['min_position'] - tweets = PyQuery(json['items_html'])('div.js-stream-tweet') - - if len(tweets) == 0: - break - - for tweetHTML in tweets: - tweetPQ = PyQuery(tweetHTML) - tweet = models.Tweet() - - usernameTweet = tweetPQ("span.username.js-action-profile-name b").text(); - txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')); - retweets = int(tweetPQ("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")); - favorites = int(tweetPQ("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")); - dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")); - id = tweetPQ.attr("data-tweet-id"); - permalink = tweetPQ.attr("data-permalink-path"); - user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id")) - - geo = '' - geoSpan = tweetPQ('span.Tweet-geo') - if len(geoSpan) > 0: - geo = geoSpan.attr('title') - urls = [] - for link in tweetPQ("a"): - try: - urls.append((link.attrib["data-expanded-url"])) - except KeyError: - pass - tweet.id = id - tweet.permalink = 'https://twitter.com' + permalink - tweet.username = usernameTweet - - tweet.text = txt - tweet.date = datetime.datetime.fromtimestamp(dateSec) - tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime("%a %b %d %X +0000 %Y") - tweet.retweets = retweets - tweet.favorites = favorites - tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) - tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) - tweet.geo = geo - tweet.urls = ",".join(urls) - tweet.author_id = user_id - - results.append(tweet) - resultsAux.append(tweet) - - if receiveBuffer and len(resultsAux) >= bufferLength: - receiveBuffer(resultsAux) - resultsAux = [] - - if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets: - active = False - break - - - if receiveBuffer and len(resultsAux) > 0: - receiveBuffer(resultsAux) - - return results - - @staticmethod - def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy): - url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&%smax_position=%s" - - urlGetData = '' - if hasattr(tweetCriteria, 'username'): - urlGetData += ' from:' + tweetCriteria.username - - if hasattr(tweetCriteria, 'since'): - urlGetData += ' since:' + tweetCriteria.since - - if hasattr(tweetCriteria, 'until'): - urlGetData += ' until:' + tweetCriteria.until - - if hasattr(tweetCriteria, 'querySearch'): - urlGetData += ' ' + tweetCriteria.querySearch - - if hasattr(tweetCriteria, 'lang'): - urlLang = 'lang=' + tweetCriteria.lang + '&' - else: - urlLang = '' - url = url % (urllib.parse.quote(urlGetData), urlLang, refreshCursor) - #print(url) - - headers = [ - ('Host', "twitter.com"), - ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"), - ('Accept', "application/json, text/javascript, */*; q=0.01"), - ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"), - ('X-Requested-With', "XMLHttpRequest"), - ('Referer', url), - ('Connection', "keep-alive") - ] - - if proxy: - opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy, 'https': proxy}), urllib2.HTTPCookieProcessor(cookieJar)) - else: - opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) - opener.addheaders = headers - - try: - response = opener.open(url) - jsonResponse = response.read() - except: - #print("Twitter weird response. Try to see on browser: ", url) - print("Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd" % urllib.parse.quote(urlGetData)) - print("Unexpected error:", sys.exc_info()[0]) - sys.exit() - return - - dataJson = json.loads(jsonResponse.decode()) - - return dataJson \ No newline at end of file + + def __init__(self): + pass + + @staticmethod + def getTweets(tweetCriteria, receiveBuffer=None, bufferLength=100, proxy=None): + refreshCursor = '' + + results = [] + resultsAux = [] + cookieJar = http.cookiejar.CookieJar() + + active = True + + while active: + json = TweetManager.getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy) + if len(json['items_html'].strip()) == 0: + break + + refreshCursor = json['min_position'] + tweets = PyQuery(json['items_html'])('div.js-stream-tweet') + + if len(tweets) == 0: + break + + for tweetHTML in tweets: + tweetPQ = PyQuery(tweetHTML) + tweet = models.Tweet() + + usernameTweet = tweetPQ("span.username.js-action-profile-name b").text() + txt = re.sub(r"\s+", " ", tweetPQ("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@')) + retweets = int(tweetPQ( + "span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) + favorites = int(tweetPQ( + "span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr("data-tweet-stat-count").replace(",", "")) + dateSec = int(tweetPQ("small.time span.js-short-timestamp").attr("data-time")) + id = tweetPQ.attr("data-tweet-id") + permalink = tweetPQ.attr("data-permalink-path") + user_id = int(tweetPQ("a.js-user-profile-link").attr("data-user-id")) + + geo = '' + geoSpan = tweetPQ('span.Tweet-geo') + if len(geoSpan) > 0: + geo = geoSpan.attr('title') + urls = [] + for link in tweetPQ("a"): + try: + urls.append((link.attrib["data-expanded-url"])) + except KeyError: + pass + tweet.id = id + tweet.permalink = 'https://twitter.com' + permalink + tweet.username = usernameTweet + + tweet.text = txt + tweet.date = datetime.datetime.fromtimestamp(dateSec) + tweet.formatted_date = datetime.datetime.fromtimestamp(dateSec).strftime("%a %b %d %X +0000 %Y") + tweet.retweets = retweets + tweet.favorites = favorites + tweet.mentions = " ".join(re.compile('(@\\w*)').findall(tweet.text)) + tweet.hashtags = " ".join(re.compile('(#\\w*)').findall(tweet.text)) + tweet.geo = geo + tweet.urls = ",".join(urls) + tweet.author_id = user_id + + results.append(tweet) + resultsAux.append(tweet) + + if receiveBuffer and len(resultsAux) >= bufferLength: + receiveBuffer(resultsAux) + resultsAux = [] + + if tweetCriteria.maxTweets > 0 and len(results) >= tweetCriteria.maxTweets: + active = False + break + + if receiveBuffer and len(resultsAux) > 0: + receiveBuffer(resultsAux) + + return results + + @staticmethod + def getJsonReponse(tweetCriteria, refreshCursor, cookieJar, proxy): + url = "https://twitter.com/i/search/timeline?f=tweets&q=%s&src=typd&%smax_position=%s" + + urlGetData = '' + if hasattr(tweetCriteria, 'username'): + urlGetData += ' from:' + tweetCriteria.username + + if hasattr(tweetCriteria, 'since'): + urlGetData += ' since:' + tweetCriteria.since + + if hasattr(tweetCriteria, 'until'): + urlGetData += ' until:' + tweetCriteria.until + + if hasattr(tweetCriteria, 'querySearch'): + urlGetData += ' ' + tweetCriteria.querySearch + + if hasattr(tweetCriteria, 'lang'): + urlLang = 'lang=' + tweetCriteria.lang + '&' + else: + urlLang = '' + url = url % (urllib.parse.quote(urlGetData), urlLang, refreshCursor) + # print(url) + + headers = [ + ('Host', "twitter.com"), + ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"), + ('Accept', "application/json, text/javascript, */*; q=0.01"), + ('Accept-Language', "de,en-US;q=0.7,en;q=0.3"), + ('X-Requested-With', "XMLHttpRequest"), + ('Referer', url), + ('Connection', "keep-alive") + ] + + if proxy: + opener = urllib.request.build_opener(urllib.request.ProxyHandler( + {'http': proxy, 'https': proxy}), urllib.request.HTTPCookieProcessor(cookieJar)) + else: + opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookieJar)) + opener.addheaders = headers + + try: + response = opener.open(url) + jsonResponse = response.read() + except BaseException: + #print("Twitter weird response. Try to see on browser: ", url) + print( + "Twitter weird response. Try to see on browser: https://twitter.com/search?q=%s&src=typd" % + urllib.parse.quote(urlGetData)) + print("Unexpected error:", sys.exc_info()[0]) + sys.exit() + return + + dataJson = json.loads(jsonResponse.decode()) + + return dataJson diff --git a/got3/manager/__init__.py b/got3/manager/__init__.py index 95c7d792..6afe0fc5 100644 --- a/got3/manager/__init__.py +++ b/got3/manager/__init__.py @@ -1,2 +1,2 @@ from .TweetCriteria import TweetCriteria -from .TweetManager import TweetManager \ No newline at end of file +from .TweetManager import TweetManager diff --git a/got3/models/Tweet.py b/got3/models/Tweet.py index c7904e9b..f1c096de 100644 --- a/got3/models/Tweet.py +++ b/got3/models/Tweet.py @@ -1,4 +1,4 @@ class Tweet: - - def __init__(self): - pass \ No newline at end of file + + def __init__(self): + pass diff --git a/got3/models/__init__.py b/got3/models/__init__.py index c9db3d54..a700db0f 100644 --- a/got3/models/__init__.py +++ b/got3/models/__init__.py @@ -1 +1 @@ -from .Tweet import Tweet \ No newline at end of file +from .Tweet import Tweet