
Commit 8af3ad2

[UPDATE]
* deletion of the node_modules/ dir
* add a scraping engine and 2 default scrapers
* add one example of usage
* [WIP]: write tests, add full support of Flask and Schedule, write some docs
1 parent 42e25f7 commit 8af3ad2


387 files changed: 1082 additions & 35933 deletions


.gitignore

Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
# Created by https://www.gitignore.io/api/sonar,flask,python,sonarqube
# Edit at https://www.gitignore.io/?templates=sonar,flask,python,sonarqube

### Flask ###
instance/*
!instance/.gitignore
.webassets-cache

### Flask.Python Stack ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

### Python ###
# Byte-compiled / optimized / DLL files

# C extensions

# Distribution / packaging

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.

# Installer logs

# Unit test / coverage reports

# Translations

# Django stuff:

# Flask stuff:

# Scrapy stuff:

# Sphinx documentation

# PyBuilder

# Jupyter Notebook

# IPython

# pyenv

# celery beat schedule file

# SageMath parsed files

# Environments

# Spyder project settings

# Rope project settings

# mkdocs documentation

# mypy

# Pyre type checker

### Python Patch ###
.venv/

### Sonar ###
#Sonar generated dir
/.sonar/

### SonarQube ###
# SonarQube ignore files.
#
# https://docs.sonarqube.org/display/SCAN/Analyzing+with+SonarQube+Scanner
# Sonar Scanner working directories
.sonar/
.scannerwork/

# http://www.sonarlint.org/commandline/
# SonarLint working directories, configuration files (including credentials)
.sonarlint/

# End of https://www.gitignore.io/api/sonar,flask,python,sonarqube

README.md

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ PyToMe have a list of website where it wil fetch data(jobs/projects) and will no
 [Sanix-darker](https://github.com/sanix-darker)
 
 ## Contributors
-[Adonis Simo](https://github.com/simo97)
+[Adonis Simo (root)](https://github.com/simo97)
 
 ## Organization
 [Python Cameroon](https://github.com/python-cameroun)

api/example.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
"""
An example of how to use this code.

** How it works **
1 - import the available scrapers
2 - import the scraping engine
3 - import the default callback method, json_callback_writer()
4 - create an instance of the scraping engine with the 2 default scrapers
5 - run it by calling its run() method, passing it the utils method to use as the callback

Normally you will end up with a scrapper.json file in the current folder containing the scraping results.
"""
from scraper.scraper import GitHubJobScrapper, PythonJobScrapper
from scraper.engine import ScraperEngine
from scraper.utils import json_callback_writer

app = ScraperEngine([GitHubJobScrapper(), PythonJobScrapper()])
app.run(json_callback_writer)
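
Since ScraperEngine.run() simply hands the merged list of job dicts to whatever callable it is given (see scraper/engine.py below, and the field names documented in scraper/mixins.py), a custom callback can stand in for json_callback_writer. A minimal sketch, assuming only that contract; the print_titles_callback name is illustrative, not part of the repository:

from scraper.scraper import GitHubJobScrapper, PythonJobScrapper
from scraper.engine import ScraperEngine


def print_titles_callback(jobs):
    # 'jobs' is the merged list of job dicts produced by every registered scraper
    for job in jobs:
        print(job.get('job_title', 'untitled'), '->', job.get('job_details_link', ''))


app = ScraperEngine([GitHubJobScrapper(), PythonJobScrapper()])
app.run(print_titles_callback)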

api/requirements.txt

Lines changed: 0 additions & 4 deletions
This file was deleted.

api/scraper/engine.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
from scraper.mixins import PyToMeBaseScrapperMixin


class ScraperEngine:
    """
    Plugin system to load and unload scrapers
    """

    _data = []
    _scrapers = []

    def __init__(self, plugins: list = list(), *args, **kwargs):
        self._data = []
        self._scrapers = []
        for scraper in plugins:
            if isinstance(scraper, PyToMeBaseScrapperMixin):  # only keep instances of PyToMeBaseScrapperMixin subclasses
                self._scrapers.append(scraper)

    def run(self, cb):
        scrapers = self._scrapers
        scrp_data = []
        for scraper in scrapers:
            scrp_data += scraper.launch()  # run each scraper and append its results to the same list
        self._data = scrp_data
        cb(self._data)  # hand the merged data to the callback
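
As a usage note: anything in the plugins list that is not an instance of a PyToMeBaseScrapperMixin subclass is silently dropped by __init__, and run() invokes the callback exactly once with the concatenated results of every scraper's launch(). A small sketch, assuming the scraper classes shipped in this commit:

from scraper.scraper import GitHubJobScrapper
from scraper.engine import ScraperEngine

engine = ScraperEngine([GitHubJobScrapper(), object()])  # the plain object() is filtered out
engine.run(lambda jobs: print(len(jobs), 'job entries scraped'))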

api/scraper/example.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
"""
An example of how to use this code.

** How it works **
1 - import the available scrapers
2 - import the scraping engine
3 - import the default callback method, json_callback_writer()
4 - create an instance of the scraping engine with the 2 default scrapers
5 - run it by calling its run() method, passing it the utils method to use as the callback

Normally you will end up with a scrapper.json file in the current folder containing the scraping results.
"""
from scraper.scraper import GitHubJobScrapper, PythonJobScrapper
from scraper.engine import ScraperEngine
from scraper.utils import json_callback_writer

app = ScraperEngine([GitHubJobScrapper(), PythonJobScrapper()])
app.run(json_callback_writer)

api/scraper/mixins.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
"""
The base mixin used to implement the different scrapers.

TODO : Write a complete and working doc
TODO : Write tests
TODO : Make the code follow the PEP 8 coding style
"""

import requests
import shortuuid as shuuid
from bs4 import BeautifulSoup
import time

NO_DESCRIPTION = 'No description provided by the site'


class PyToMeBaseScrapperMixin:
    """
    default url : https://pythonjobs.github.io/
    """

    base_url = ''
    job_block = []
    job_data_set = []

    def __init__(self, base_url, *args, **kwargs):
        if base_url is None:
            self.base_url = 'https://pythonjobs.github.io/'
        else:
            self.base_url = base_url
        self.job_block = []

    def get_datablocks(self, page_content):
        """
        Retrieve all the blocks from the targeted page which represent job info;
        the result should be of the form
        [
            'html content here',
            'html content here',
            ......
        ]  # to be parsed one by one with get_block_content()
        """
        raise NotImplementedError('This method has not been implemented; the current class should only be used through inheritance')

    def get_block_content(self):
        """
        Retrieve the content of one block of data; the data should be shaped as follows
        {
            'job_title': '',
            'job_description': '',
            'job_date': '',
            'job_location': '',
            'job_details_link': '',
            'job_compagny': '',
            'job_site': '',  # the site where the job was scraped
            'job_hash': ''  # built with the shortuuid lib by combining the job_title + job_compagny + job_date strings
        }
        """
        raise NotImplementedError('This method has not been implemented; the current class should only be used through inheritance')

    def _log_result(self):
        # print(self.job_block)
        pass

    def launch(self):
        self.__init__()  # reset the scraper state before a new run

        if self.job_block == []:
            page_response = requests.get(self.base_url)
            page_content = BeautifulSoup(page_response.content, 'html.parser')
            self.get_datablocks(page_content)
            self.get_block_content()
        return self.get_job_dataset()

    def get_job_dataset(self):
        """
        Just return the current dataset of jobs, fetched or not yet fetched
        """
        return self.job_data_set
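
To illustrate how the mixin is meant to be subclassed, here is a hypothetical scraper for an imaginary job board; the URL and the CSS classes (div.job-listing, h2.title, span.company, ...) are invented for the sketch and this is not one of the scrapers shipped in this commit:

import shortuuid as shuuid
from scraper.mixins import PyToMeBaseScrapperMixin, NO_DESCRIPTION


class ExampleBoardScrapper(PyToMeBaseScrapperMixin):

    def __init__(self, base_url='https://jobs.example.org/', *args, **kwargs):
        # a default base_url is needed because launch() re-calls __init__() without arguments
        super().__init__(base_url, *args, **kwargs)

    def get_datablocks(self, page_content):
        # keep every HTML block that represents one job offer
        self.job_block = page_content.find_all('div', class_='job-listing')

    def get_block_content(self):
        # turn each raw block into the dict shape documented in the mixin
        for block in self.job_block:
            title = block.find('h2', class_='title').get_text(strip=True)
            company = block.find('span', class_='company').get_text(strip=True)
            date = block.find('span', class_='date').get_text(strip=True)
            self.job_data_set.append({
                'job_title': title,
                'job_description': NO_DESCRIPTION,
                'job_date': date,
                'job_location': block.find('span', class_='location').get_text(strip=True),
                'job_details_link': block.find('a')['href'],
                'job_compagny': company,
                'job_site': self.base_url,
                'job_hash': shuuid.uuid(title + company + date),
            })

Such a subclass would then be registered like the defaults, e.g. ScraperEngine([ExampleBoardScrapper()]).run(json_callback_writer).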
