82 changes: 82 additions & 0 deletions codepile/tutorialspoint/tutorialspoint.py
@@ -0,0 +1,82 @@
from codepile.dataset import DatasetInfo, DatasetSources, RawDataset, Scraper, Processor, Analyser, Dataset
from codepile.tutorialspoint.tutorialspoint_spider import TutorialspointArticleSpider
from scrapy.crawler import CrawlerProcess
import os, sys
import pathlib
from datetime import datetime

class TutorialspointScraper(Scraper):
def __init__(self, tmppath, target_dir):
self.target_dir = target_dir
self.info = DatasetInfo(
id="Tutorialspoint Dataset",
description="Articles about various computing topics from TutorialsPoint.com",
size=3,
source_uri="https://tutorialspoint.com",
dataset_pros="",
dataset_cons="",
languages=["english"],
coding_languages=[],
modalities=[],
source_license="",
source_citation="TutorialsPoint.com",
data_owner="James Baicoianu",
contributers=["James Baicoianu"],
data_end=datetime(2022, 11, 3)
)

def download(self) -> RawDataset:

pathlib.Path(self.target_dir).mkdir(parents=True, exist_ok=True)
os.chdir(self.target_dir)

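        # Throttle the crawl: one request at a time per domain with a 0.6s delay; the high
        # global concurrency ceiling only matters if more domains are ever added.
        # Scraped items are handled by the JsonS3WriterPipeline defined in tutorialspoint_spider.py.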
crawlsettings = {
"SCHEDULER_PRIORITY_QUEUE": 'scrapy.pqueues.DownloaderAwarePriorityQueue',
"CONCURRENT_REQUESTS": 1000,
"LOG_LEVEL": "WARN",
"DOWNLOAD_DELAY": .6,
"CONCURRENT_REQUESTS_PER_DOMAIN": 1,
"AUTOTHROTTLE_ENABLED": False,
"AUTOTHROTTLE_DEBUG": True,
"AUTOTHROTTLE_TARGET_CONCURRENCY": .5,
"REACTOR_THREADPOOL_MAXSIZE": 100,
'ITEM_PIPELINES': {
'codepile.tutorialspoint.tutorialspoint_spider.JsonS3WriterPipeline': 300,
}
#"JOBDIR": "scrapy-job",
}

# TODO - crawl type should be an argument we can pass in
crawltype = 'articles'

if crawltype == 'articles':
process = CrawlerProcess(crawlsettings)
process.crawl(TutorialspointArticleSpider)
process.start()

        return RawDataset(storage_uris=[f'file:///{self.target_dir}'],
                          metadata='')


#class TutorialspointProcessor(Processor):
# def process(self, raw_data: RawDataset, *args, **kwargs):
# # TODO - transform raw JSON data into whatever format we need for training
# return

class TutorialspointDataset(Dataset):
def __init__(self, tempdir, target_dir):
self.scraper = TutorialspointScraper(tempdir, target_dir)
#self.processor = DiscourseCodeProcessor()
    def info(self):
        return self.scraper.info

    def id(self):
        return self.scraper.info.id

    def download(self):
        return self.scraper.download()

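# Example invocation (the output directory is taken from the command line):
#   python codepile/tutorialspoint/tutorialspoint.py <target_dir>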
if __name__=="__main__":
if not os.path.exists("data/"):
os.makedirs("data/")
tutorialspoint_dataset = TutorialspointDataset('/tmp', sys.argv[1])
print(tutorialspoint_dataset.download())
184 changes: 184 additions & 0 deletions codepile/tutorialspoint/tutorialspoint_spider.py
@@ -0,0 +1,184 @@
import scrapy
import json
import pathlib
import re
import os
import sys
import html2text
import zstd
import boto3
import botocore.exceptions
from datetime import datetime

S3_BUCKET = "s-eai-neox"
S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"
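# Compressed crawl output and the crawled-URL checkpoint are uploaded under this S3 prefix.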

s3client = boto3.client('s3')

class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKCYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'


class TutorialspointArticleSpider(scrapy.Spider):
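    """Crawl tutorialspoint.com via its sitemaps, skipping URLs already recorded in
    crawled.txt, and yield one item per article with the body converted to markdown."""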
name = "tutorialspoint"
headers = { }
user_agent = 'Mozilla/5.0 (compatible; Carper-GooseBot/8000; +https://carper.ai/)'

# Keep a list of crawled URLs which we can persist across runs
crawled = set()

def start_requests(self):
sitemaps = [
'https://www.tutorialspoint.com/tp.xml',
'https://www.tutorialspoint.com/tp1.xml',
]

print('Loading crawled sites list...', flush=True)
if not os.path.isfile('crawled.txt'):
            # Crawled URLs list not found locally, see if there's one in S3
try:
s3client.download_file(S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt', 'crawled.txt')
print('got it', flush=True)
            except Exception:
                print("couldn't fetch crawled.txt from S3", flush=True)

# Load the list of crawled sites into a dict
try:
with open('crawled.txt', 'r') as crawled:
for url in crawled:
#self.crawled[url.strip()] = True
self.crawled.add(url.strip())
        except OSError:
            print("couldn't open crawled.txt", flush=True)


        print('Fetching sitemaps...', flush=True)
for url in sitemaps:
yield scrapy.Request(url=url, callback=self.parse_sitemap)

# Uncomment to test parsing of a single article
#yield scrapy.Request(url="https://www.tutorialspoint.com/program-to-count-number-of-walls-required-to-partition-top-left-and-bottom-right-cells-in-python")

def parse_sitemap(self, response):
print('Processing sitemap:', response.url, flush=True)
response.selector.remove_namespaces()
print('running xpath selector', flush=True)
selectors = response.xpath('//loc//text()')
print('iterating', flush=True)
added = 0
for selector in selectors:
            url = selector.get()
if url in self.crawled:
print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url, flush=True)
else:
#print("YIELD", url, flush=True)
added += 1
yield scrapy.Request(url=url, callback=self.parse)
print('Added %d urls of %d total' % (added, len(selectors)), flush=True)

def parse(self, response):
res = response.css('.tutorial-content>*')
htmlsrc = ''
numclears = 0
for node in res:
block = node.get()
#print('check node', block, node, flush=True)
if block == '<div class="clear"></div>':
numclears += 1
elif numclears == 1:
# Extract the HTML between the first and second clear divs
htmlsrc += node.extract()
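        # Build the item: markdown-converted article text plus article metadata serialized
        # as a JSON string.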
data = {
"text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
"meta": json.dumps({
"source": "tutorialspoint",
"url": response.url,
"title": response.css('.qa_title::text').get(),
"author": response.css('.qa_author span::text').get(),
"author_link": response.css('.author-caret a').attrib['href'],
"categories": response.css('.qa_category>a>span::text').getall(),
"html": htmlsrc,
"updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
"crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
})
}
yield data
print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url, flush=True)

with open('crawled.txt', 'a+') as crawled:
crawled.write('%s\n' % (response.url))

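        # Queue any absolute links found in the chapter table of contents so multi-page
        # tutorials are crawled in full.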
links = [link.attrib['href'] for link in response.css('.toc.chapters a')]
for url in links:
if url.find('https://') == 0:
#print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url, flush=True)
yield scrapy.Request(url=url)

def process_item(self, item, spider):
pass

class JsonS3WriterPipeline:
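    """Append scraped items to an hourly-rotated JSONL file; each finished file is
    zstd-compressed and uploaded to S3 along with the crawled-URL checkpoint."""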
def open_spider(self, spider):
self.current_filename = None
self.current_file = None

def close_spider(self, spider):
self.close_current_file()

def get_current_filename(self):
# New file name every hour, for checkpointing
fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H.jsonl')
return fname

def close_current_file(self):
if self.current_file:
print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename, flush=True)
old_filename = self.current_filename
self.current_file.close()
self.current_file = None
self.current_filename = None

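            # Compress the finished file with zstd, drop the uncompressed copy, then sync
            # the archive and the crawled-URL list to S3.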
zip_filename = old_filename + '.zstd'
try:
with open(old_filename, 'rb') as f_in, open(zip_filename, 'wb') as f_out:
f_out.write(zstd.ZSTD_compress(f_in.read()))

os.remove(old_filename)

s3client.upload_file(zip_filename, S3_BUCKET, S3_BUCKET_PATH + zip_filename)
s3client.upload_file('crawled.txt', S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt')
print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)
except IOError:
                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed:', zip_filename, flush=True)
except botocore.exceptions.NoCredentialsError:
                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)


def get_file(self):
current_filename = self.get_current_filename()

        if self.current_file is None or current_filename != self.current_filename:
self.close_current_file()
self.current_filename = current_filename
self.current_file = open(current_filename, 'a+')
print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename, flush=True)

return self.current_file

def process_item(self, item, spider):
file = self.get_file()
line = json.dumps(item) + "\n"
file.write(line)
file.flush()
return item


4 changes: 3 additions & 1 deletion pyproject.toml
@@ -21,7 +21,9 @@ dependencies = [
"pandas",
"pyarrow",
"lxml",
"pyspark"
"pyspark",
"scrapy",
"html2text",
]

[tool.pytest.ini_options]