From eb726f7c53a968af056ffa64ba90c67f1efa5fe6 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 13:47:33 +0000
Subject: [PATCH 1/8] Switch to zstd, fix fetching crawled.txt from s3

---
 .../tutorialspoint/tutorialspoint_spider.py | 181 ++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 codepile/tutorialspoint/tutorialspoint_spider.py

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
new file mode 100644
index 0000000..514788d
--- /dev/null
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -0,0 +1,181 @@
+import scrapy
+import json
+import pathlib
+import re
+import os
+import sys
+import html2text
+import zstd
+import boto3
+import botocore.exceptions
+from datetime import datetime
+
+#S3_BUCKET = "s-eai-neox"
+S3_BUCKET = "assets.metacade.com"
+S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"
+
+s3client = boto3.client('s3')
+
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+class TutorialspointArticleSpider(scrapy.Spider):
+    name = "tutorialspoint"
+    headers = { }
+    user_agent = 'Mozilla/5.0 (compatible; Carper-GooseBot/8000; +https://carper.ai/)'
+
+    # Keep a list of crawled URLs which we can persist across runs
+    crawled = set()
+
+    def start_requests(self):
+        sitemaps = [
+            'https://www.tutorialspoint.com/tp.xml',
+            'https://www.tutorialspoint.com/tp1.xml',
+        ]
+
+        print('Loading crawled sites list...')
+        if not os.path.isfile('crawled.txt'):
+            # Crawled URLs list not found, see if there's one in s3
+            try:
+                s3client.download_file(S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt', 'crawled.txt')
+                print('got it')
+            except:
+                print('couldnt fetch crawled.txt')
+
+        # Load the list of crawled sites into a dict
+        try:
+            with open('crawled.txt', 'r') as crawled:
+                for url in crawled:
+                    #self.crawled[url.strip()] = True
+                    self.crawled.add(url.strip())
+        except:
+            print('couldnt open crawled.txt')
+            pass
+
+
+        print('Fetching sitemaps...');
+        for url in sitemaps:
+            yield scrapy.Request(url=url, callback=self.parse_sitemap)
+
+        # Uncomment to test parsing of a single article
+        #yield scrapy.Request(url="https://www.tutorialspoint.com/program-to-count-number-of-walls-required-to-partition-top-left-and-bottom-right-cells-in-python")
+
+    def parse_sitemap(self, response):
+        print('Processing sitemap:', response.url)
+        response.selector.remove_namespaces()
+        print('running xpath selector')
+        selectors = response.xpath('//loc//text()')
+        print('iterating')
+        added = 0
+        for selector in selectors:
+            url = selector.get();
+            if url in self.crawled:
+                print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url)
+            else:
+                #print("YIELD", url)
+                added += 1
+                yield scrapy.Request(url=url, callback=self.parse)
+        print('Added %d urls of %d total' % (added, len(selectors)))
+
+    def parse(self, response):
+        res = response.css('.tutorial-content>*')
+        htmlsrc = ''
+        numclears = 0
+        for node in res:
+            block = node.get()
+            #print('check node', block, node)
+            if block == '<div class="clear"></div>':
+                numclears += 1
+            elif numclears == 1:
+                # Extract the HTML between the first and second clear divs
+                htmlsrc += node.extract()
+        data = {
+            "url": response.url,
+            "title": response.css('.qa_title::text').get(),
+            "author": response.css('.qa_author span::text').get(),
+            "author_link": response.css('.author-caret a').attrib['href'],
+            "categories": response.css('.qa_category>a>span::text').getall(),
+            "html": htmlsrc,
+            #"text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
+            "updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
+            "crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
+        }
+        yield data
+        print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url)
+
+        with open('crawled.txt', 'a+') as crawled:
+            crawled.write('%s\n' % (response.url))
+
+        links = [link.attrib['href'] for link in response.css('.toc.chapters a')]
+        for url in links:
+            if url.find('https://') == 0:
+                #print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url)
+                yield scrapy.Request(url=url)
+
+    def process_item(self, item, spider):
+        pass
+
+class JsonS3WriterPipeline:
+    def open_spider(self, spider):
+        self.current_filename = None
+        self.current_file = None
+
+    def close_spider(self, spider):
+        self.close_current_file()
+
+    def get_current_filename(self):
+        # New file name every hour, for checkpointing
+        fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H_%M.jsonl')
+        return fname
+
+    def close_current_file(self):
+        if self.current_file:
+            print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename)
+            old_filename = self.current_filename
+            self.current_file.close()
+            self.current_file = None
+            self.current_filename = None
+
+            zip_filename = old_filename + '.zstd'
+            try:
+                with open(old_filename, 'rb') as f_in, open(zip_filename, 'wb') as f_out:
+                    f_out.write(zstd.ZSTD_compress(f_in.read()))
+
+                os.remove(old_filename)
+
+                s3client.upload_file(zip_filename, S3_BUCKET, S3_BUCKET_PATH + zip_filename)
+                s3client.upload_file('crawled.txt', S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt')
+                print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+            except IOError:
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed', self.current_filename)
+            except botocore.exceptions.NoCredentialsError:
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+
+
+    def get_file(self):
+        current_filename = self.get_current_filename()
+
+        if self.current_file == None or current_filename != self.current_filename:
+            self.close_current_file()
+            self.current_filename = current_filename
+            self.current_file = open(current_filename, 'a+')
+            print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename)
+
+        return self.current_file
+
+    def process_item(self, item, spider):
+        file = self.get_file()
+        line = json.dumps(item) + "\n"
+        file.write(line)
+        return item
+
+
From 48eb7a32eb0d719c2d0142861be61c627483b774 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 13:52:06 +0000
Subject: [PATCH 2/8] Accept data path as argument

---
 codepile/tutorialspoint/tutorialspoint.py | 82 +++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 codepile/tutorialspoint/tutorialspoint.py

diff --git a/codepile/tutorialspoint/tutorialspoint.py b/codepile/tutorialspoint/tutorialspoint.py
new file mode 100644
index 0000000..78be25d
--- /dev/null
+++ b/codepile/tutorialspoint/tutorialspoint.py
@@ -0,0 +1,82 @@
+from codepile.dataset import DatasetInfo, DatasetSources, RawDataset, Scraper, Processor, Analyser, Dataset
+from codepile.tutorialspoint.tutorialspoint_spider import TutorialspointArticleSpider
+from scrapy.crawler import CrawlerProcess
+import os, sys
+import pathlib
+from datetime import datetime
+
+class TutorialspointScraper(Scraper):
+    def __init__(self, tmppath, target_dir):
+        self.target_dir = target_dir
+        self.info = DatasetInfo(
+            id="Tutorialspoint Dataset",
+            description="Articles about various computing topics from TutorialsPoint.com",
+            size=3,
+            source_uri="https://tutorialspoint.com",
+            dataset_pros="",
+            dataset_cons="",
+            languages=["english"],
+            coding_languages=[],
+            modalities=[],
+            source_license="",
+            source_citation="TutorialsPoint.com",
+            data_owner="James Baicoianu",
+            contributers=["James Baicoianu"],
+            data_end=datetime(2022, 11, 3)
+        )
+
+    def download(self) -> RawDataset:
+
+        pathlib.Path(self.target_dir).mkdir(parents=True, exist_ok=True)
+        os.chdir(self.target_dir)
+
+        crawlsettings = {
+            "SCHEDULER_PRIORITY_QUEUE": 'scrapy.pqueues.DownloaderAwarePriorityQueue',
+            "CONCURRENT_REQUESTS": 1000,
+            "LOG_LEVEL": "WARN",
+            "DOWNLOAD_DELAY": .6,
+            "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
+            "AUTOTHROTTLE_ENABLED": False,
+            "AUTOTHROTTLE_DEBUG": True,
+            "AUTOTHROTTLE_TARGET_CONCURRENCY": .5,
+            "REACTOR_THREADPOOL_MAXSIZE": 100,
+            'ITEM_PIPELINES': {
+                'codepile.tutorialspoint.tutorialspoint_spider.JsonS3WriterPipeline': 300,
+            }
+            #"JOBDIR": "scrapy-job",
+        }
+
+        # TODO - crawl type should be an argument we can pass in
+        crawltype = 'articles'
+
+        if crawltype == 'articles':
+            process = CrawlerProcess(crawlsettings)
+            process.crawl(TutorialspointArticleSpider)
+            process.start()
+
+        return RawDataset(storage_uris=[f'file:///{self.target_dir}'],
+                metadata='')
+
+
+#class TutorialspointProcessor(Processor):
+#    def process(self, raw_data: RawDataset, *args, **kwargs):
+#        # TODO - transform raw JSON data into whatever format we need for training
+#        return
+
+class TutorialspointDataset(Dataset):
+    def __init__(self, tempdir, target_dir):
+        self.scraper = TutorialspointScraper(tempdir, target_dir)
+        #self.processor = DiscourseCodeProcessor()
+    def info(self):
+        return self.info
+
+    def id(self):
+        return self.info.id
+    def download(self):
+        self.scraper.download()
+
+if __name__=="__main__":
+    if not os.path.exists("data/"):
+        os.makedirs("data/")
+    tutorialspoint_dataset = TutorialspointDataset('/tmp', sys.argv[1])
+    print(tutorialspoint_dataset.download())
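Note: with PATCH 2/8 applied, the output directory comes from the first command-line argument and the temp dir is hard-coded to /tmp. A minimal usage sketch, assuming the codepile package is installed (e.g. pip install -e .); the target path below is a placeholder:

    # CLI form:
    #   python -m codepile.tutorialspoint.tutorialspoint /data/tutorialspoint
    # Programmatic form:
    from codepile.tutorialspoint.tutorialspoint import TutorialspointDataset

    dataset = TutorialspointDataset('/tmp', '/data/tutorialspoint')
    dataset.download()
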
From c0510d5d22bf772ac7ced5bd8184d9fa3b29e5b5 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 22:43:07 +0000
Subject: [PATCH 3/8] Use correct s3 bucket

---
 codepile/tutorialspoint/tutorialspoint_spider.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index 514788d..a164936 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -10,8 +10,7 @@
 import botocore.exceptions
 from datetime import datetime
 
-#S3_BUCKET = "s-eai-neox"
-S3_BUCKET = "assets.metacade.com"
+S3_BUCKET = "s-eai-neox"
 S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"
 
 s3client = boto3.client('s3')

From 002090436e74d191d6f9d3cdf7d55212f9ff3954 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 22:43:21 +0000
Subject: [PATCH 4/8] Use lm_dataset format

---
 .../tutorialspoint/tutorialspoint_spider.py | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index a164936..1efd715 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -98,15 +98,18 @@ def parse(self, response):
                 # Extract the HTML between the first and second clear divs
                 htmlsrc += node.extract()
         data = {
-            "url": response.url,
-            "title": response.css('.qa_title::text').get(),
-            "author": response.css('.qa_author span::text').get(),
-            "author_link": response.css('.author-caret a').attrib['href'],
-            "categories": response.css('.qa_category>a>span::text').getall(),
-            "html": htmlsrc,
-            #"text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
-            "updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
-            "crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
+            "text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
+            "meta": json.dumps({
+                "source": "tutorialspoint",
+                "url": response.url,
+                "title": response.css('.qa_title::text').get(),
+                "author": response.css('.qa_author span::text').get(),
+                "author_link": response.css('.author-caret a').attrib['href'],
+                "categories": response.css('.qa_category>a>span::text').getall(),
+                "html": htmlsrc,
+                "updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
+                "crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
+            })
         }
         yield data
         print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url)

From fac5de1a676544369d3271e8f0fb713030f26185 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 22:43:40 +0000
Subject: [PATCH 5/8] Remove minute from filename

---
 codepile/tutorialspoint/tutorialspoint_spider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index 1efd715..f636d80 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -136,7 +136,7 @@ def close_spider(self, spider):
 
     def get_current_filename(self):
         # New file name every hour, for checkpointing
-        fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H_%M.jsonl')
+        fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H.jsonl')
         return fname
 
     def close_current_file(self):

From 49ddd2476f950e638d88a74acfd4093cfb7cdbcf Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Tue, 8 Nov 2022 05:53:14 +0000
Subject: [PATCH 6/8] Added scrapy and html2text dependencies

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d0aad39..1e9f42c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,9 @@ dependencies = [
     "pandas",
     "pyarrow",
     "lxml",
-    "pyspark"
+    "pyspark",
+    "scrapy",
+    "html2text",
 ]
 
 [tool.pytest.ini_options]
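Note: with the lm_dataset format from PATCH 4/8, each JSONL line carries the article text under "text" and a JSON-encoded string under "meta", so reading a chunk back takes two json.loads calls per record. A minimal sketch; the file name is a placeholder:

    import json

    # Each line is one record; "meta" is a JSON string, not a nested object.
    with open('crawled-items-2022-11-08_05.jsonl', 'r') as f:
        for line in f:
            record = json.loads(line)
            meta = json.loads(record['meta'])
            print(meta['url'], len(record['text']))
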
From b57df54a6618ebeb4e450ce044dfb48ea148df8a Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Tue, 8 Nov 2022 05:53:35 +0000
Subject: [PATCH 7/8] Flush jsonl file to disk after each write

---
 codepile/tutorialspoint/tutorialspoint_spider.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index f636d80..21a05b1 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -178,6 +178,7 @@ def process_item(self, item, spider):
         file = self.get_file()
         line = json.dumps(item) + "\n"
         file.write(line)
+        file.flush()
         return item
 
 

From 22805857971239e9e27b8ecbd3faf3c5f91d36fc Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Tue, 8 Nov 2022 05:56:50 +0000
Subject: [PATCH 8/8] Flush stdout after each print

---
 .../tutorialspoint/tutorialspoint_spider.py | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index 21a05b1..0a75172 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -41,14 +41,14 @@ def start_requests(self):
             'https://www.tutorialspoint.com/tp1.xml',
         ]
 
-        print('Loading crawled sites list...')
+        print('Loading crawled sites list...', flush=True)
         if not os.path.isfile('crawled.txt'):
             # Crawled URLs list not found, see if there's one in s3
             try:
                 s3client.download_file(S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt', 'crawled.txt')
-                print('got it')
+                print('got it', flush=True)
             except:
-                print('couldnt fetch crawled.txt')
+                print('couldnt fetch crawled.txt', flush=True)
 
         # Load the list of crawled sites into a dict
         try:
@@ -57,11 +57,11 @@ def start_requests(self):
                     #self.crawled[url.strip()] = True
                     self.crawled.add(url.strip())
         except:
-            print('couldnt open crawled.txt')
+            print('couldnt open crawled.txt', flush=True)
             pass
 
 
-        print('Fetching sitemaps...');
+        print('Fetching sitemaps...', flush=True);
        for url in sitemaps:
             yield scrapy.Request(url=url, callback=self.parse_sitemap)
 
@@ -69,21 +69,21 @@ def start_requests(self):
         #yield scrapy.Request(url="https://www.tutorialspoint.com/program-to-count-number-of-walls-required-to-partition-top-left-and-bottom-right-cells-in-python")
 
     def parse_sitemap(self, response):
-        print('Processing sitemap:', response.url)
+        print('Processing sitemap:', response.url, flush=True)
         response.selector.remove_namespaces()
-        print('running xpath selector')
+        print('running xpath selector', flush=True)
         selectors = response.xpath('//loc//text()')
-        print('iterating')
+        print('iterating', flush=True)
         added = 0
         for selector in selectors:
             url = selector.get();
             if url in self.crawled:
-                print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url)
+                print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url, flush=True)
             else:
-                #print("YIELD", url)
+                #print("YIELD", url, flush=True)
                 added += 1
                 yield scrapy.Request(url=url, callback=self.parse)
-        print('Added %d urls of %d total' % (added, len(selectors)))
+        print('Added %d urls of %d total' % (added, len(selectors)), flush=True)
 
     def parse(self, response):
         res = response.css('.tutorial-content>*')
@@ -91,7 +91,7 @@ def parse(self, response):
         numclears = 0
         for node in res:
             block = node.get()
-            #print('check node', block, node)
+            #print('check node', block, node, flush=True)
             if block == '<div class="clear"></div>':
                 numclears += 1
             elif numclears == 1:
@@ -112,7 +112,7 @@ def parse(self, response):
             })
         }
         yield data
-        print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url)
+        print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url, flush=True)
 
         with open('crawled.txt', 'a+') as crawled:
             crawled.write('%s\n' % (response.url))
@@ -120,7 +120,7 @@ def parse(self, response):
         links = [link.attrib['href'] for link in response.css('.toc.chapters a')]
         for url in links:
             if url.find('https://') == 0:
-                #print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url)
+                #print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url, flush=True)
                 yield scrapy.Request(url=url)
 
     def process_item(self, item, spider):
@@ -141,7 +141,7 @@ def get_current_filename(self):
     def close_current_file(self):
         if self.current_file:
-            print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename)
+            print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename, flush=True)
             old_filename = self.current_filename
             self.current_file.close()
             self.current_file = None
             self.current_filename = None
@@ -156,11 +156,11 @@ def close_current_file(self):
 
                 s3client.upload_file(zip_filename, S3_BUCKET, S3_BUCKET_PATH + zip_filename)
                 s3client.upload_file('crawled.txt', S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt')
-                print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+                print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)
             except IOError:
-                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed', self.current_filename)
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed', self.current_filename, flush=True)
             except botocore.exceptions.NoCredentialsError:
-                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)
 
 
     def get_file(self):
@@ -170,7 +170,7 @@ def get_file(self):
             self.close_current_file()
             self.current_filename = current_filename
             self.current_file = open(current_filename, 'a+')
-            print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename)
+            print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename, flush=True)
 
         return self.current_file
 
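Note: once the crawl is running, upload progress can be spot-checked by listing the bucket prefix the pipeline writes to. A minimal sketch reusing the spider's bucket constants, assuming AWS credentials are configured; list_objects_v2 returns at most 1000 keys per call, so a paginator may be needed for long crawls:

    import boto3

    S3_BUCKET = "s-eai-neox"
    S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"

    s3client = boto3.client('s3')

    # List the compressed checkpoint chunks (and crawled.txt) uploaded so far.
    resp = s3client.list_objects_v2(Bucket=S3_BUCKET, Prefix=S3_BUCKET_PATH)
    for obj in resp.get('Contents', []):
        print(obj['Key'], obj['Size'])
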