82 changes: 82 additions & 0 deletions codepile/tutorialspoint/tutorialspoint.py
@@ -0,0 +1,82 @@
from codepile.dataset import DatasetInfo, DatasetSources, RawDataset, Scraper, Processor, Analyser, Dataset
from codepile.tutorialspoint.tutorialspoint_spider import TutorialspointArticleSpider
from scrapy.crawler import CrawlerProcess
import os, sys
import pathlib
from datetime import datetime

class TutorialspointScraper(Scraper):
def __init__(self, tmppath, target_dir):
self.target_dir = target_dir
self.info = DatasetInfo(
id="Tutorialspoint Dataset",
description="Articles about various computing topics from TutorialsPoint.com",
size=3,
source_uri="https://tutorialspoint.com",
dataset_pros="",
dataset_cons="",
languages=["english"],
coding_languages=[],
modalities=[],
source_license="",
source_citation="TutorialsPoint.com",
data_owner="James Baicoianu",
contributers=["James Baicoianu"],
data_end=datetime(2022, 11, 3)
)

def download(self) -> RawDataset:

pathlib.Path(self.target_dir).mkdir(parents=True, exist_ok=True)
os.chdir(self.target_dir)

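        # Throttle the crawl: one request at a time per domain with a 0.6s delay; the high
        # global concurrency ceiling only matters if more domains are ever added.
        # Scraped items are handled by the JsonS3WriterPipeline defined in tutorialspoint_spider.py.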
crawlsettings = {
"SCHEDULER_PRIORITY_QUEUE": 'scrapy.pqueues.DownloaderAwarePriorityQueue',
"CONCURRENT_REQUESTS": 1000,
"LOG_LEVEL": "WARN",
"DOWNLOAD_DELAY": .6,
"CONCURRENT_REQUESTS_PER_DOMAIN": 1,
"AUTOTHROTTLE_ENABLED": False,
"AUTOTHROTTLE_DEBUG": True,
"AUTOTHROTTLE_TARGET_CONCURRENCY": .5,
"REACTOR_THREADPOOL_MAXSIZE": 100,
'ITEM_PIPELINES': {
'codepile.tutorialspoint.tutorialspoint_spider.JsonS3WriterPipeline': 300,
}
#"JOBDIR": "scrapy-job",
}

# TODO - crawl type should be an argument we can pass in
crawltype = 'articles'

if crawltype == 'articles':
process = CrawlerProcess(crawlsettings)
process.crawl(TutorialspointArticleSpider)
process.start()

        return RawDataset(storage_uris=[f'file:///{self.target_dir}'],
                          metadata='')


#class TutorialspointProcessor(Processor):
# def process(self, raw_data: RawDataset, *args, **kwargs):
# # TODO - transform raw JSON data into whatever format we need for training
# return

class TutorialspointDataset(Dataset):
def __init__(self, tempdir, target_dir):
self.scraper = TutorialspointScraper(tempdir, target_dir)
#self.processor = DiscourseCodeProcessor()
    def info(self):
        return self.scraper.info

    def id(self):
        return self.scraper.info.id

    def download(self):
        return self.scraper.download()

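# Example invocation (the output directory is taken from the command line):
#   python codepile/tutorialspoint/tutorialspoint.py <target_dir>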
if __name__=="__main__":
if not os.path.exists("data/"):
os.makedirs("data/")
tutorialspoint_dataset = TutorialspointDataset('/tmp', sys.argv[1])
print(tutorialspoint_dataset.download())
184 changes: 184 additions & 0 deletions codepile/tutorialspoint/tutorialspoint_spider.py
@@ -0,0 +1,184 @@
import scrapy
import json
import pathlib
import re
import os
import sys
import html2text
import zstd
import boto3
import botocore.exceptions
from datetime import datetime

S3_BUCKET = "s-eai-neox"
S3_BUCKET_PATH = "data/codepile/tutorialsites/tutorialspoint/"
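# Compressed crawl output and the crawled-URL checkpoint are uploaded under this S3 prefix.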

s3client = boto3.client('s3')

class bcolors:
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKCYAN = '\033[96m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'


class TutorialspointArticleSpider(scrapy.Spider):
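    """Crawl tutorialspoint.com via its sitemaps, skipping URLs already recorded in
    crawled.txt, and yield one item per article with the body converted to markdown."""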
name = "tutorialspoint"
headers = { }
user_agent = 'Mozilla/5.0 (compatible; Carper-GooseBot/8000; +https://carper.ai/)'

# Keep a list of crawled URLs which we can persist across runs
crawled = set()

def start_requests(self):
sitemaps = [
'https://www.tutorialspoint.com/tp.xml',
'https://www.tutorialspoint.com/tp1.xml',
]

print('Loading crawled sites list...', flush=True)
if not os.path.isfile('crawled.txt'):
            # Crawled URLs list not found locally, see if there's one in S3
try:
s3client.download_file(S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt', 'crawled.txt')
print('got it', flush=True)
            except Exception:
                print("couldn't fetch crawled.txt from S3", flush=True)

# Load the list of crawled sites into a dict
try:
with open('crawled.txt', 'r') as crawled:
for url in crawled:
#self.crawled[url.strip()] = True
self.crawled.add(url.strip())
        except OSError:
            print("couldn't open crawled.txt", flush=True)


        print('Fetching sitemaps...', flush=True)
for url in sitemaps:
yield scrapy.Request(url=url, callback=self.parse_sitemap)

# Uncomment to test parsing of a single article
#yield scrapy.Request(url="https://www.tutorialspoint.com/program-to-count-number-of-walls-required-to-partition-top-left-and-bottom-right-cells-in-python")

def parse_sitemap(self, response):
print('Processing sitemap:', response.url, flush=True)
response.selector.remove_namespaces()
print('running xpath selector', flush=True)
selectors = response.xpath('//loc//text()')
print('iterating', flush=True)
added = 0
for selector in selectors:
            url = selector.get()
if url in self.crawled:
print('[' + bcolors.OKBLUE + ' skip ' + bcolors.ENDC + ']', url, flush=True)
else:
#print("YIELD", url, flush=True)
added += 1
yield scrapy.Request(url=url, callback=self.parse)
print('Added %d urls of %d total' % (added, len(selectors)), flush=True)

def parse(self, response):
res = response.css('.tutorial-content>*')
htmlsrc = ''
numclears = 0
for node in res:
block = node.get()
#print('check node', block, node, flush=True)
if block == '<div class="clear"></div>':
numclears += 1
elif numclears == 1:
# Extract the HTML between the first and second clear divs
htmlsrc += node.extract()
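        # Build the item: markdown-converted article text plus article metadata serialized
        # as a JSON string.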
data = {
"text": html2text.html2text(htmlsrc).replace(u"\u00a0", " "),
"meta": json.dumps({
"source": "tutorialspoint",
"url": response.url,
"title": response.css('.qa_title::text').get(),
"author": response.css('.qa_author span::text').get(),
"author_link": response.css('.author-caret a').attrib['href'],
"categories": response.css('.qa_category>a>span::text').getall(),
"html": htmlsrc,
"updated_time": response.css('.qa_answer_dtm::text').get().strip().replace('Updated on ', ''),
"crawled_time": datetime.now().strftime('%d-%b-%Y %H:%M:%S'),
})
}
yield data
print('[' + bcolors.OKGREEN + ' save ' + bcolors.ENDC + ']', response.url, flush=True)

with open('crawled.txt', 'a+') as crawled:
crawled.write('%s\n' % (response.url))

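        # Queue any absolute links found in the chapter table of contents so multi-page
        # tutorials are crawled in full.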
links = [link.attrib['href'] for link in response.css('.toc.chapters a')]
for url in links:
if url.find('https://') == 0:
#print('[' + bcolors.OKCYAN + ' link ' + bcolors.ENDC + ']', url, flush=True)
yield scrapy.Request(url=url)

def process_item(self, item, spider):
pass

class JsonS3WriterPipeline:
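    """Append scraped items to an hourly-rotated JSONL file; each finished file is
    zstd-compressed and uploaded to S3 along with the crawled-URL checkpoint."""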
def open_spider(self, spider):
self.current_filename = None
self.current_file = None

def close_spider(self, spider):
self.close_current_file()

def get_current_filename(self):
# New file name every hour, for checkpointing
fname = datetime.now().strftime('crawled-items-%Y-%m-%d_%H.jsonl')
return fname

def close_current_file(self):
if self.current_file:
print('[' + bcolors.HEADER + 'close ' + bcolors.ENDC + ']', self.current_filename, flush=True)
old_filename = self.current_filename
self.current_file.close()
self.current_file = None
self.current_filename = None

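            # Compress the finished file with zstd, drop the uncompressed copy, then sync
            # the archive and the crawled-URL list to S3.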
zip_filename = old_filename + '.zstd'
try:
with open(old_filename, 'rb') as f_in, open(zip_filename, 'wb') as f_out:
f_out.write(zstd.ZSTD_compress(f_in.read()))

os.remove(old_filename)

s3client.upload_file(zip_filename, S3_BUCKET, S3_BUCKET_PATH + zip_filename)
s3client.upload_file('crawled.txt', S3_BUCKET, S3_BUCKET_PATH + 'crawled.txt')
print('[' + bcolors.HEADER + ' sync ' + bcolors.ENDC + '] uploaded s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)
except IOError:
                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] writing file failed:', zip_filename, flush=True)
except botocore.exceptions.NoCredentialsError:
                print('[' + bcolors.FAIL + ' fail ' + bcolors.ENDC + '] syncing file failed: s3://%s/%s%s' % (S3_BUCKET, S3_BUCKET_PATH, zip_filename), flush=True)


def get_file(self):
current_filename = self.get_current_filename()

        if self.current_file is None or current_filename != self.current_filename:
self.close_current_file()
self.current_filename = current_filename
self.current_file = open(current_filename, 'a+')
print('[' + bcolors.HEADER + ' open ' + bcolors.ENDC + ']', self.current_filename, flush=True)

return self.current_file

def process_item(self, item, spider):
file = self.get_file()
line = json.dumps(item) + "\n"
file.write(line)
file.flush()
return item


4 changes: 3 additions & 1 deletion pyproject.toml
@@ -21,7 +21,9 @@ dependencies = [
"pandas",
"pyarrow",
"lxml",
"pyspark"
"pyspark",
"scrapy",
"html2text",
]

[tool.pytest.ini_options]