From eb726f7c53a968af056ffa64ba90c67f1efa5fe6 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 13:47:33 +0000
Subject: [PATCH 1/8] Switch to zstd, fix fetching crawled.txt from s3

---
 .../tutorialspoint/tutorialspoint_spider.py | 181 ++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 codepile/tutorialspoint/tutorialspoint_spider.py

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
new file mode 100644
index 0000000..514788d
--- /dev/null
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -0,0 +1,181 @@
+import scrapy
+import json
+import pathlib
+import re
+import os
+import sys
+import html2text
+import zstd
+import boto3
+import botocore.exceptions
+from datetime import datetime
+
+#S3_BUCKET = "s-eai-neox"
+S3_BUCKET = "assets.metacade.com"
+S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"
+
+s3client = boto3.client('s3')
+
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+class TutorialspointArticleSpider(scrapy.Spider):
+    name = "tutorialspoint"
+    headers = { }
+    user_agent = 'Mozilla/5.0 (compatible; Carper-GooseBot/8000; +https://carper.ai/)'
+
+    # Keep a list of crawled URLs which we can persist across runs
+    crawled = set()
+
+    def start_requests(self):
+        sitemaps = [
+            'https://www.tutorialspoint.com/tp.xml',
+            'https://www.tutorialspoint.com/tp1.xml',
+        ]
+
+        print('Loading crawled sites list...')
+        if not os.path.isfile('crawled.txt'):
+            # Crawled URLs list not found, see if there's one in s3
+            try:
+                s3client.download_file(S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt', 'crawled.txt')
+                print('got it')
+            except:
+                print('couldnt fetch crawled.txt')
+
+        # Load the list of crawled sites into a dict
+        try:
+            with open('crawled.txt', 'r') as crawled:
+                for url in crawled:
+                    #self.crawled[url.strip()] = True
+                    self.crawled.add(url.strip())
+        except:
+            print('couldnt open crawled.txt')
+            pass
+
+
+        print('Fetching sitemaps...');
+        for url in sitemaps:
+            yield scrapy.Request(url=url, callback=self.parse_sitemap)
+
+        # Uncomment to test parsing of a single article
+        #yield scrapy.Request(url="https://www.tutorialspoint.com/program-to-count-number-of-walls-required-to-partition-top-left-and-bottom-right-cells-in-python")
+
+    def parse_sitemap(self, response):
+        print('Processing sitemap:', response.url)
+        response.selector.remove_namespaces()
+        print('running xpath selector')
+        selectors = response.xpath('//loc//text()')
+        print('iterating')
+        added = 0
+        for selector in selectors:
+            url = selector.get();
+            if url in self.crawled:
+                print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url)
+            else:
+                #print("YIELD", url)
+                added += 1
+                yield scrapy.Request(url=url, callback=self.parse)
+        print('Added %d urls of %d total' % (added, len(selectors)))
+
+    def parse(self, response):
+        res = response.css('.tutorial-content>*')
+        htmlsrc = ''
+        numclears = 0
+        for node in res:
+            block = node.get()
+            #print('check node', block, node)
+            if block == '<div class="clear"></div>':
+                numclears += 1
+            elif numclears == 1:
+                # Extract the HTML between the first and second clear divs
+                htmlsrc += node.extract()
+        data = {
+            "url": response.url,
+            "title": response.css('.qa_title::text').get(),
+            "author": response.css('.qa_author span::text').get(),
+            "author_link": response.css('.author-caret a').attrib['href'],
+            "categories": response.css('.qa_category>a>span::text').getall(),
+            "html": htmlsrc,
+            #"text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
+            "updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
+            "crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
+        }
+        yield data
+        print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url)
+
+        with open('crawled.txt', 'a+') as crawled:
+            crawled.write('%s\n' % (response.url))
+
+        links = [link.attrib['href'] for link in response.css('.toc.chapters a')]
+        for url in links:
+            if url.find('https://') == 0:
+                #print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url)
+                yield scrapy.Request(url=url)
+
+    def process_item(self, item, spider):
+        pass
+
+class JsonS3WriterPipeline:
+    def open_spider(self, spider):
+        self.current_filename = None
+        self.current_file = None
+
+    def close_spider(self, spider):
+        self.close_current_file()
+
+    def get_current_filename(self):
+        # New file name every hour, for checkpointing
+        fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H_%M.jsonl')
+        return fname
+
+    def close_current_file(self):
+        if self.current_file:
+            print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename)
+            old_filename = self.current_filename
+            self.current_file.close()
+            self.current_file = None
+            self.current_filename = None
+
+            zip_filename = old_filename + '.zstd'
+            try:
+                with open(old_filename, 'rb') as f_in, open(zip_filename, 'wb') as f_out:
+                    f_out.write(zstd.ZSTD_compress(f_in.read()))
+
+                os.remove(old_filename)
+
+                s3client.upload_file(zip_filename, S3_BUCKET, S3_BUCKET_PATH + zip_filename)
+                s3client.upload_file('crawled.txt', S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt')
+                print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+            except IOError:
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed', self.current_filename)
+            except botocore.exceptions.NoCredentialsError:
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+
+
+    def get_file(self):
+        current_filename = self.get_current_filename()
+
+        if self.current_file == None or current_filename != self.current_filename:
+            self.close_current_file()
+            self.current_filename = current_filename
+            self.current_file = open(current_filename, 'a+')
+            print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename)
+
+        return self.current_file
+
+    def process_item(self, item, spider):
+        file = self.get_file()
+        line = json.dumps(item) + "\n"
+        file.write(line)
+        return item
+
+
From 48eb7a32eb0d719c2d0142861be61c627483b774 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 13:52:06 +0000
Subject: [PATCH 2/8] Accept data path as argument

---
 codepile/tutorialspoint/tutorialspoint.py | 82 +++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 codepile/tutorialspoint/tutorialspoint.py

diff --git a/codepile/tutorialspoint/tutorialspoint.py b/codepile/tutorialspoint/tutorialspoint.py
new file mode 100644
index 0000000..78be25d
--- /dev/null
+++ b/codepile/tutorialspoint/tutorialspoint.py
@@ -0,0 +1,82 @@
+from codepile.dataset import DatasetInfo, DatasetSources, RawDataset, Scraper, Processor, Analyser, Dataset
+from codepile.tutorialspoint.tutorialspoint_spider import TutorialspointArticleSpider
+from scrapy.crawler import CrawlerProcess
+import os, sys
+import pathlib
+from datetime import datetime
+
+class TutorialspointScraper(Scraper):
+    def __init__(self, tmppath, target_dir):
+        self.target_dir = target_dir
+        self.info = DatasetInfo(
+            id="Tutorialspoint Dataset",
+            description="Articles about various computing topics from TutorialsPoint.com",
+            size=3,
+            source_uri="https://tutorialspoint.com",
+            dataset_pros="",
+            dataset_cons="",
+            languages=["english"],
+            coding_languages=[],
+            modalities=[],
+            source_license="",
+            source_citation="TutorialsPoint.com",
+            data_owner="James Baicoianu",
+            contributers=["James Baicoianu"],
+            data_end=datetime(2022, 11, 3)
+        )
+
+    def download(self) -> RawDataset:
+
+        pathlib.Path(self.target_dir).mkdir(parents=True, exist_ok=True)
+        os.chdir(self.target_dir)
+
+        crawlsettings = {
+            "SCHEDULER_PRIORITY_QUEUE": 'scrapy.pqueues.DownloaderAwarePriorityQueue',
+            "CONCURRENT_REQUESTS": 1000,
+            "LOG_LEVEL": "WARN",
+            "DOWNLOAD_DELAY": .6,
+            "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
+            "AUTOTHROTTLE_ENABLED": False,
+            "AUTOTHROTTLE_DEBUG": True,
+            "AUTOTHROTTLE_TARGET_CONCURRENCY": .5,
+            "REACTOR_THREADPOOL_MAXSIZE": 100,
+            'ITEM_PIPELINES': {
+                'codepile.tutorialspoint.tutorialspoint_spider.JsonS3WriterPipeline': 300,
+            }
+            #"JOBDIR": "scrapy-job",
+        }
+
+        # TODO - crawl type should be an argument we can pass in
+        crawltype = 'articles'
+
+        if crawltype == 'articles':
+            process = CrawlerProcess(crawlsettings)
+            process.crawl(TutorialspointArticleSpider)
+            process.start()
+
+        return RawDataset(storage_uris=[f'file:///{self.target_dir}'],
+                metadata='')
+
+
+#class TutorialspointProcessor(Processor):
+#    def process(self, raw_data: RawDataset, *args, **kwargs):
+#        # TODO - transform raw JSON data into whatever format we need for training
+#        return
+
+class TutorialspointDataset(Dataset):
+    def __init__(self, tempdir, target_dir):
+        self.scraper = TutorialspointScraper(tempdir, target_dir)
+        #self.processor = DiscourseCodeProcessor()
+    def info(self):
+        return self.info
+
+    def id(self):
+        return self.info.id
+    def download(self):
+        self.scraper.download()
+
+if __name__=="__main__":
+    if not os.path.exists("data/"):
+        os.makedirs("data/")
+    tutorialspoint_dataset = TutorialspointDataset('/tmp', sys.argv[1])
+    print(tutorialspoint_dataset.download())
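Note: with PATCH 2/8 applied, the output directory comes from the first command-line argument and the temp dir is hard-coded to /tmp. A minimal usage sketch, assuming the codepile package is installed (e.g. pip install -e .); the target path below is a placeholder:

    # CLI form:
    #   python -m codepile.tutorialspoint.tutorialspoint /data/tutorialspoint
    # Programmatic form:
    from codepile.tutorialspoint.tutorialspoint import TutorialspointDataset

    dataset = TutorialspointDataset('/tmp', '/data/tutorialspoint')
    dataset.download()
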
From c0510d5d22bf772ac7ced5bd8184d9fa3b29e5b5 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 22:43:07 +0000
Subject: [PATCH 3/8] Use correct s3 bucket

---
 codepile/tutorialspoint/tutorialspoint_spider.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index 514788d..a164936 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -10,8 +10,7 @@
 import botocore.exceptions
 from datetime import datetime
 
-#S3_BUCKET = "s-eai-neox"
-S3_BUCKET = "assets.metacade.com"
+S3_BUCKET = "s-eai-neox"
 S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"
 
 s3client = boto3.client('s3')

From 002090436e74d191d6f9d3cdf7d55212f9ff3954 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 22:43:21 +0000
Subject: [PATCH 4/8] Use lm_dataset format

---
 .../tutorialspoint/tutorialspoint_spider.py | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index a164936..1efd715 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -98,15 +98,18 @@ def parse(self, response):
                 # Extract the HTML between the first and second clear divs
                 htmlsrc += node.extract()
         data = {
-            "url": response.url,
-            "title": response.css('.qa_title::text').get(),
-            "author": response.css('.qa_author span::text').get(),
-            "author_link": response.css('.author-caret a').attrib['href'],
-            "categories": response.css('.qa_category>a>span::text').getall(),
-            "html": htmlsrc,
-            #"text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
-            "updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
-            "crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
+            "text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
+            "meta": json.dumps({
+                "source": "tutorialspoint",
+                "url": response.url,
+                "title": response.css('.qa_title::text').get(),
+                "author": response.css('.qa_author span::text').get(),
+                "author_link": response.css('.author-caret a').attrib['href'],
+                "categories": response.css('.qa_category>a>span::text').getall(),
+                "html": htmlsrc,
+                "updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
+                "crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
+            })
         }
         yield data
         print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url)

From fac5de1a676544369d3271e8f0fb713030f26185 Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Fri, 4 Nov 2022 22:43:40 +0000
Subject: [PATCH 5/8] Remove minute from filename

---
 codepile/tutorialspoint/tutorialspoint_spider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index 1efd715..f636d80 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -136,7 +136,7 @@ def close_spider(self, spider):
 
     def get_current_filename(self):
         # New file name every hour, for checkpointing
-        fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H_%M.jsonl')
+        fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H.jsonl')
         return fname
 
     def close_current_file(self):

From 49ddd2476f950e638d88a74acfd4093cfb7cdbcf Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Tue, 8 Nov 2022 05:53:14 +0000
Subject: [PATCH 6/8] Added scrapy and html2text dependencies

---
 pyproject.toml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d0aad39..1e9f42c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,9 @@ dependencies = [
     "pandas",
     "pyarrow",
     "lxml",
-    "pyspark"
+    "pyspark",
+    "scrapy",
+    "html2text",
 ]
 
 [tool.pytest.ini_options]
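Note: with the lm_dataset format from PATCH 4/8, each JSONL line carries the article text under "text" and a JSON-encoded string under "meta", so reading a chunk back takes two json.loads calls per record. A minimal sketch; the file name is a placeholder:

    import json

    # Each line is one record; "meta" is a JSON string, not a nested object.
    with open('crawled-items-2022-11-08_05.jsonl', 'r') as f:
        for line in f:
            record = json.loads(line)
            meta = json.loads(record['meta'])
            print(meta['url'], len(record['text']))
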
From b57df54a6618ebeb4e450ce044dfb48ea148df8a Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Tue, 8 Nov 2022 05:53:35 +0000
Subject: [PATCH 7/8] Flush jsonl file to disk after each write

---
 codepile/tutorialspoint/tutorialspoint_spider.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index f636d80..21a05b1 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -178,6 +178,7 @@ def process_item(self, item, spider):
         file = self.get_file()
         line = json.dumps(item) + "\n"
         file.write(line)
+        file.flush()
         return item
 
 

From 22805857971239e9e27b8ecbd3faf3c5f91d36fc Mon Sep 17 00:00:00 2001
From: James Baicoianu
Date: Tue, 8 Nov 2022 05:56:50 +0000
Subject: [PATCH 8/8] Flush stdout after each print

---
 .../tutorialspoint/tutorialspoint_spider.py | 38 +++++++++----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/codepile/tutorialspoint/tutorialspoint_spider.py b/codepile/tutorialspoint/tutorialspoint_spider.py
index 21a05b1..0a75172 100644
--- a/codepile/tutorialspoint/tutorialspoint_spider.py
+++ b/codepile/tutorialspoint/tutorialspoint_spider.py
@@ -41,14 +41,14 @@ def start_requests(self):
             'https://www.tutorialspoint.com/tp1.xml',
         ]
 
-        print('Loading crawled sites list...')
+        print('Loading crawled sites list...', flush=True)
         if not os.path.isfile('crawled.txt'):
             # Crawled URLs list not found, see if there's one in s3
             try:
                 s3client.download_file(S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt', 'crawled.txt')
-                print('got it')
+                print('got it', flush=True)
             except:
-                print('couldnt fetch crawled.txt')
+                print('couldnt fetch crawled.txt', flush=True)
 
         # Load the list of crawled sites into a dict
         try:
@@ -57,11 +57,11 @@ def start_requests(self):
                     #self.crawled[url.strip()] = True
                     self.crawled.add(url.strip())
         except:
-            print('couldnt open crawled.txt')
+            print('couldnt open crawled.txt', flush=True)
             pass
 
 
-        print('Fetching sitemaps...');
+        print('Fetching sitemaps...', flush=True);
        for url in sitemaps:
             yield scrapy.Request(url=url, callback=self.parse_sitemap)
 
@@ -69,21 +69,21 @@ def start_requests(self):
         #yield scrapy.Request(url="https://www.tutorialspoint.com/program-to-count-number-of-walls-required-to-partition-top-left-and-bottom-right-cells-in-python")
 
     def parse_sitemap(self, response):
-        print('Processing sitemap:', response.url)
+        print('Processing sitemap:', response.url, flush=True)
         response.selector.remove_namespaces()
-        print('running xpath selector')
+        print('running xpath selector', flush=True)
         selectors = response.xpath('//loc//text()')
-        print('iterating')
+        print('iterating', flush=True)
         added = 0
         for selector in selectors:
             url = selector.get();
             if url in self.crawled:
-                print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url)
+                print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url, flush=True)
             else:
-                #print("YIELD", url)
+                #print("YIELD", url, flush=True)
                 added += 1
                 yield scrapy.Request(url=url, callback=self.parse)
-        print('Added %d urls of %d total' % (added, len(selectors)))
+        print('Added %d urls of %d total' % (added, len(selectors)), flush=True)
 
     def parse(self, response):
         res = response.css('.tutorial-content>*')
@@ -91,7 +91,7 @@ def parse(self, response):
         numclears = 0
         for node in res:
             block = node.get()
-            #print('check node', block, node)
+            #print('check node', block, node, flush=True)
             if block == '<div class="clear"></div>':
                 numclears += 1
             elif numclears == 1:
@@ -112,7 +112,7 @@ def parse(self, response):
             })
         }
         yield data
-        print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url)
+        print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url, flush=True)
 
         with open('crawled.txt', 'a+') as crawled:
             crawled.write('%s\n' % (response.url))
@@ -120,7 +120,7 @@ def parse(self, response):
         links = [link.attrib['href'] for link in response.css('.toc.chapters a')]
         for url in links:
             if url.find('https://') == 0:
-                #print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url)
+                #print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url, flush=True)
                 yield scrapy.Request(url=url)
 
     def process_item(self, item, spider):
@@ -141,7 +141,7 @@ def get_current_filename(self):
     def close_current_file(self):
         if self.current_file:
-            print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename)
+            print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename, flush=True)
             old_filename = self.current_filename
             self.current_file.close()
             self.current_file = None
             self.current_filename = None
@@ -156,11 +156,11 @@ def close_current_file(self):
 
                 s3client.upload_file(zip_filename, S3_BUCKET, S3_BUCKET_PATH + zip_filename)
                 s3client.upload_file('crawled.txt', S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt')
-                print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+                print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)
             except IOError:
-                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed', self.current_filename)
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed', self.current_filename, flush=True)
             except botocore.exceptions.NoCredentialsError:
-                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename))
+                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)
 
 
     def get_file(self):
@@ -170,7 +170,7 @@ def get_file(self):
             self.close_current_file()
             self.current_filename = current_filename
             self.current_file = open(current_filename, 'a+')
-            print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename)
+            print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename, flush=True)
 
         return self.current_file
 
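Note: once the crawl is running, upload progress can be spot-checked by listing the bucket prefix the pipeline writes to. A minimal sketch reusing the spider's bucket constants, assuming AWS credentials are configured; list_objects_v2 returns at most 1000 keys per call, so a paginator may be needed for long crawls:

    import boto3

    S3_BUCKET = "s-eai-neox"
    S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"

    s3client = boto3.client('s3')

    # List the compressed checkpoint chunks (and crawled.txt) uploaded so far.
    resp = s3client.list_objects_v2(Bucket=S3_BUCKET, Prefix=S3_BUCKET_PATH)
    for obj in resp.get('Contents', []):
        print(obj['Key'], obj['Size'])
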