Commit e88a04f

Twitter scraper Initial commit

File tree

6 files changed: +321 additions, -0 deletions

README.md

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
# Twitter Scraper

![License: CC BY-ND 4.0](https://img.shields.io/badge/License-CC%20BY--ND%204.0-lightgrey.svg)

A simple Python-based Twitter scraper that can scrape tweets either by username or by a search query (other search parameters are supported as well). The program supports `STOP / RESUME` operation, as it keeps a log of all previous position IDs.

This project was created in order to bypass Twitter's 7-day limit, which prevents fetching tweets more than 7 days old; I needed such data for my research project.

**Please note: This is not an alternative to the official APIs provided by Twitter.**

This project is intended for students, researchers, and all those who abide by Twitter's data terms and conditions.
## Contents

1. scraper.py
2. searchParams.py
3. tweets.py
4. main.py
5. requirements.txt

## Usage

1. `scraper.py` contains all the essential code required to grab tweets and store them in a CSV file
2. `searchParams.py` is a class for initializing search parameters
3. `tweets.py` is a class whose objects represent individual tweets
4. `main.py` - the entry point that calls all of the above code. This file takes multiple arguments and is responsible for initializing all the other files.

**HELP**

`python main.py --help`
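
If you would rather drive the scraper from your own script instead of the CLI, here is a minimal sketch (hypothetical usage; `main.py` performs the same setup via click):

```
# Hypothetical programmatic usage; main.py does the same setup via click.
# Run from the project root so scraper.py and searchParams.py are importable.
import logging

from scraper import parse_json
from searchParams import SearchParams

params = SearchParams()
params.set_search_query('github')
params.set_since_date('2018-06-15')
params.set_until_date('2018-06-20')
params.set_language('en')
params.set_op('test.csv')            # tweets are appended here as ';'-separated rows
params.set_log_file_name('test.log')

# parse_json() resumes from the last 'min_pos - <id>' record in the log file,
# so the logger must write INFO-level records to that same file.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    filename=params.log_file_name,
                    filemode='a+')
params.set_logger(logging)

parse_json(params)
```

Note that `main.py` additionally writes a CSV header row to a fresh output file; the sketch above skips that step.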

## Prerequisites & Installation Instructions

This project is intended to be used with Python `3.x`, but feel free to convert it in order to use it with Python `2.x`.

A **requirements.txt** file is provided with the project, which contains all the essential packages needed to run it.

```
pip install -r requirements.txt
```
## Running the code

Use `main.py` to run the code.

For help: `python main.py --help`

Example:

Search for the **github** keyword between **2018-06-15 and 2018-06-20** and save the results to **test.csv**, with **test.log** as the log file.

```
python main.py --searchquery github --since 2018-06-15 --until 2018-06-20 --op test.csv --log test.log
```

## Output

The output of the scraper is saved to the output file provided in the parameters. By default, the output file is `op.csv`.
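
Since the output is a semicolon-separated CSV, it can be inspected with pandas (already pinned in `requirements.txt`). A small sketch, assuming the default output file:

```
# A small sketch for inspecting the scraped CSV with pandas.
# Column names come from the header row that main.py writes.
import pandas as pd

df = pd.read_csv('op.csv', sep=';')
print(df[['screen_name', 'date_time', 'tweet']].head())
```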
The program also keeps a log of all the previous search positions, and writes it to the log file provided in the params. By default, the log file is `def_log.log`. This file is required in order to resume the scraping operation if it is interrupted.

**Note: If you want to `RESUME` a previous incomplete scrape operation, make sure to provide the same log file as you did in the first run.**
## Feedback & Final Thoughts

Again, this project is intended for educational use. Feel free to use it. You may face a cookies problem, where running the code for the first time works perfectly fine, but every subsequent run fails. In order to fix this, try using a `PROXY`.

The `--proxy` parameter can be used to pass a proxy IP and port.

E.g.: `0.0.0.0:80`

There are lots of free proxy sites out there that you can use.
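
For example (the address below is a placeholder; substitute a working proxy):

```
python main.py --searchquery github --since 2018-06-15 --until 2018-06-20 --proxy 0.0.0.0:80 --op test.csv --log test.log
```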

The code may not be very optimized, so if you find any bugs, please report them; feature requests, pull requests, and feedback are all welcome. If you like this project, please do give it a star.

main.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
import logging
import os
import sys

import click

from scraper import parse_json
from searchParams import SearchParams


@click.command()
@click.option('--searchquery', default=None, help='Query to be searched on Twitter')
@click.option('--username', default=None, help='User to search for')
@click.option('--since', default=None, help='Start date in the format yyyy-mm-dd (e.g. 2017-08-25)')
@click.option('--until', default=None, help='End date in the format yyyy-mm-dd (e.g. 2019-01-20)')
@click.option('--language', default='en', help='Tweet language to search for')
@click.option('--maxcount', default=0, help='Max number of tweets you want to grab')
@click.option('--proxy', default=None, help='Proxy ip to use')
@click.option('--op', default='op.csv', help='Output file to save the tweets (default: op.csv)')
@click.option('--log', default='def_log.log', help='Log file name to log search index (default: def_log.log)')
def arg_parser(searchquery, username, since, until, language, maxcount, proxy, op, log):
    """
    Python based Twitter Scraper. \n
    Provide search parameters when running this script. \n
    Example: python main.py --searchquery notpetya --since 2017-06-07 --until 2017-07-15 --op notpetya.csv --log test.log
    """
    search_parameters = SearchParams()
    search_parameters.set_search_query(searchquery)
    search_parameters.set_user_name(username)
    search_parameters.set_since_date(since)
    search_parameters.set_until_date(until)
    search_parameters.set_language(language)
    search_parameters.set_max_retrieval_count(maxcount)
    search_parameters.set_proxy(proxy)
    search_parameters.set_op(op)
    search_parameters.set_log_file_name(log)

    # Log everything to the given file and mirror INFO-level messages to the console
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        datefmt='%m-%d %H:%M',
                        filename=search_parameters.log_file_name,
                        filemode='a+')
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    search_parameters.set_logger(logging)

    # Write the CSV header once, when the output file is new or empty
    with open(search_parameters.op, 'a+') as f:
        if os.stat(search_parameters.op).st_size == 0:
            f.write('uuid;tweet_id;user_name;screen_name;tweet;date_time;retweet_count;fav_count;link\n')

    parse_json(search_parameters)


if __name__ == "__main__":
    if sys.version_info[0] < 3:
        print('Python 3 not found. Please install Python 3.x and try again')
    else:
        arg_parser(sys.argv[1:])

requirements.txt

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
beautifulsoup4==4.7.1
bs4==0.0.1
certifi==2018.11.29
cfscrape==1.9.5
chardet==3.0.4
Click==7.0
cssselect==1.0.3
idna==2.8
lxml==4.3.0
numpy==1.15.4
pandas==0.23.4
pyquery==1.4.0
python-dateutil==2.7.5
pytz==2018.9
requests==2.21.0
six==1.12.0
soupsieve==1.7
urllib3==1.24.1

scraper.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import datetime
import http.cookiejar
import json
import re
import uuid
import urllib.parse, urllib.request, urllib.error

from pyquery import PyQuery

from tweets import Tweet


def get_tweets(search_params, current_position):
    """
    Build the search query and get the tweets
    :param search_params: SearchParams object
    :param current_position: Min position from which to retrieve the tweets
    :return: twitter json_data
    """
    base_url = "https://twitter.com/i/search/timeline?f=tweets&q={}&src=typd&{}max_position={}"
    query = ''
    query = query + (' ' + search_params.search_query) if search_params.search_query else query
    query = query + (' from:' + search_params.account_name) if search_params.account_name else query
    query = query + (' since:' + search_params.since_date) if search_params.since_date else query
    query = query + (' until:' + search_params.until_date) if search_params.until_date else query
    lang = ('lang=' + search_params.language + '&') if search_params.language else ''

    query = urllib.parse.quote(query)
    base_url = base_url.format(query, lang, current_position)
    print(base_url)

    cookie_jar = http.cookiejar.CookieJar()
    headers = [
        ('Host', "twitter.com"),
        ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"),
        ('Accept', "application/json, text/javascript, */*; q=0.01"),
        ('Accept-Language', "en-US;q=0.7,en;q=0.3"),
        ('X-Requested-With', "XMLHttpRequest"),
        ('Referer', base_url),
        ('Connection', "keep-alive")
    ]

    attempts = 0
    response = None
    while attempts < 10:
        try:
            if search_params.proxy:
                print('Using IP {}'.format(search_params.proxy))
                proxy = urllib.request.ProxyHandler({'http': search_params.proxy, 'https': search_params.proxy})
                opener = urllib.request.build_opener(proxy, urllib.request.HTTPCookieProcessor(cookie_jar))
            else:
                opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
            opener.addheaders = headers
            response = opener.open(base_url)
            break
        except Exception:
            attempts += 1
            # NOTE: the retry reuses the same proxy, if one was given
            print('Retrying with different IP !!')

    if response is None:
        # All attempts failed; fail loudly instead of calling .read() on nothing
        raise urllib.error.URLError('Could not reach Twitter after 10 attempts')

    json_res = response.read()
    json_data = json.loads(json_res.decode())
    return json_data


def parse_json(search_params):
    """
    Parse the tweet JSON returned by the search timeline
    :param search_params: SearchParams object
    :return: void
    """
    min_position = get_last_search_position(search_params.log_file_name)
    count = 0
    while True:
        json_res = get_tweets(search_params, min_position)
        if len(json_res['items_html'].strip()) == 0:
            break

        min_position = json_res['min_position']
        search_params.logging.info('min_pos - {}'.format(min_position))
        item = json_res['items_html']
        scraped_tweets = PyQuery(item)
        scraped_tweets.remove('div.withheld-tweet')
        tweets = scraped_tweets('div.js-stream-tweet')

        for tweet_html in tweets:
            print(count)
            tweet_py_query = PyQuery(tweet_html)
            name = tweet_py_query.attr("data-name")
            screen_name = tweet_py_query.attr("data-screen-name")
            tweet_id = tweet_py_query.attr("data-tweet-id")
            tweet_text = re.sub(r"\s+", " ",
                                tweet_py_query("p.js-tweet-text").text().replace('# ', '#').replace('@ ', '@'))
            tweet_date_time = int(tweet_py_query("small.time span.js-short-timestamp").attr("data-time"))
            tweet_date_time = datetime.datetime.fromtimestamp(tweet_date_time)
            retweet_count = int(tweet_py_query("span.ProfileTweet-action--retweet span.ProfileTweet-actionCount").attr(
                "data-tweet-stat-count").replace(",", ""))
            favorites_count = int(
                tweet_py_query("span.ProfileTweet-action--favorite span.ProfileTweet-actionCount").attr(
                    "data-tweet-stat-count").replace(",", ""))
            permalink = 'https://twitter.com' + tweet_py_query.attr("data-permalink-path")

            tweet = Tweet(str(uuid.uuid4()), name, screen_name, tweet_id, tweet_text, tweet_date_time, retweet_count,
                          favorites_count, permalink)
            # Now write to the output file (or save to a DB)
            write_op(search_params.op, tweet)
            count += 1
            # sleep(5)
            if 0 < search_params.max_retrieval_count <= count:
                break
        # Also stop the outer loop once the max retrieval count is reached
        if 0 < search_params.max_retrieval_count <= count:
            break


def write_op(op_file, tweet):
    """
    Write a tweet to the output file
    :param op_file: op_file name
    :param tweet: Tweet object
    :return: void
    """
    with open(op_file, 'a+', encoding='utf-8') as f:
        # uuid, tweet_id, user_name, screen_name, tweet, date_time, retweet_count, fav_count, link
        f.write(
            ('%s;%s;%s;%s;%s;%s;%d;%d;%s\n' % (tweet.uuid, tweet.tweet_id, tweet.name, tweet.screen_name, tweet.tweet,
                                               tweet.date_time.strftime("%Y-%m-%d %H:%M"), tweet.retweet_count,
                                               tweet.favourites_count, tweet.link)))


def get_last_search_position(logger_file):
    """
    Required for resuming the previous search operation
    :param logger_file: Logger file name
    :return: Last position id
    """
    # The log file already exists here: main.py's logging.basicConfig creates it
    # before parse_json() runs.
    with open(logger_file, 'r+') as f:
        lines = f.read().splitlines()
    try:
        last_pos = lines[-1].split(' - ')[1]
    except IndexError:
        last_pos = ''
    return last_pos

searchParams.py

Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@
class SearchParams:
    def __init__(self):
        self.max_retrieval_count = 0
        self.search_query = None
        self.account_name = None
        self.since_date = None
        self.until_date = None
        self.language = None
        self.proxy = None
        self.op = None
        self.logging = None
        self.log_file_name = None
        # Get log file name from logger: print(logging.root.handlers[0].baseFilename)

    def set_max_retrieval_count(self, max_retrieval_count):
        self.max_retrieval_count = max_retrieval_count

    def set_search_query(self, search_query):
        self.search_query = search_query

    def set_user_name(self, account_name):
        self.account_name = account_name

    def set_since_date(self, since_date):
        self.since_date = since_date

    def set_until_date(self, until_date):
        self.until_date = until_date

    def set_language(self, language):
        self.language = language

    def set_proxy(self, proxy):
        self.proxy = proxy

    def set_op(self, op):
        self.op = op

    def set_log_file_name(self, log_file_name):
        self.log_file_name = log_file_name

    def set_logger(self, logger):
        self.logging = logger

tweets.py

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
class Tweet:
    def __init__(self, uuid, name, screen_name, tweet_id, tweet, date_time, retweet_count, favourites_count, link):
        self.uuid = uuid
        self.name = name
        self.screen_name = screen_name
        self.tweet_id = tweet_id
        self.tweet = tweet
        self.date_time = date_time
        self.retweet_count = retweet_count
        self.favourites_count = favourites_count
        self.link = link
