-
Notifications
You must be signed in to change notification settings - Fork 166
added feature to alumni and follower count #107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
"""Example to scrape a list of Companies, and put overviews in csv form""" | ||
|
||
from scrape_linkedin import CompanyScraper | ||
import pandas as pd | ||
|
||
# LIST YOUR COMPANIES HERE | ||
my_company_list = [ | ||
'facebook', 'mit-sloan-school-of-management', 'linkedin', | ||
'harvard-university' | ||
] | ||
|
||
company_data = [] | ||
|
||
with CompanyScraper() as scraper: | ||
# Get each company's overview, add to company_data list | ||
for name in my_company_list: | ||
sc = scraper.scrape(company=name, people=True) | ||
overview = sc.overview | ||
overview['company_name'] = name | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. optional: The overview already has a |
||
overview['people'] = sc.people | ||
company_data.append(overview) | ||
|
||
# Turn into dataframe for easy csv output | ||
df = pd.DataFrame(company_data) | ||
df.to_csv('out.csv', index=False) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,7 +40,8 @@ def get_company_metadata(about_section): | |
elif child.name == 'dd': | ||
content = child.get_text().strip() | ||
results[curr_header].append( | ||
RE_DUPLICATE_WHITESPACE.sub(" ", content)) # strip redundant whitespace | ||
RE_DUPLICATE_WHITESPACE.sub( | ||
" ", content)) # strip redundant whitespace | ||
|
||
for r in results: | ||
results[r] = '\n'.join(results[r]) | ||
|
@@ -58,16 +59,18 @@ def get_employee_count(s: str) -> Optional[int]: | |
class Company(ResultsObject): | ||
"""Linkedin User Profile Object""" | ||
|
||
attributes = ['overview', 'jobs', 'life', 'insights'] | ||
attributes = ['overview', 'jobs', 'life', 'insights', 'people'] | ||
|
||
# KD adds insights attribute | ||
|
||
def __init__(self, overview, jobs, life, insights): | ||
def __init__(self, overview, jobs, life, insights, people): | ||
# KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work | ||
self.overview_soup = BeautifulSoup(overview, 'html.parser') | ||
self.jobs_soup = BeautifulSoup(jobs, 'html.parser') | ||
self.life_soup = BeautifulSoup(life, 'html.parser') | ||
self.insights_soup = BeautifulSoup(insights, 'html.parser') | ||
# KD adds insights soup | ||
self.people_soup = BeautifulSoup(people, 'html.parser') | ||
|
||
@property | ||
def overview(self): | ||
|
@@ -78,12 +81,12 @@ def overview(self): | |
"image": None, | ||
"name": None, | ||
"num_employees": None, | ||
"num_followers": None, | ||
"metadata": None | ||
} | ||
|
||
# Banner containing company Name + Location | ||
banner = one_or_default( | ||
self.overview_soup, '.org-top-card') | ||
banner = one_or_default(self.overview_soup, '.org-top-card') | ||
|
||
# Main container with company overview info | ||
container = one_or_default(self.overview_soup, | ||
|
@@ -92,14 +95,20 @@ def overview(self): | |
overview["name"] = text_or_default(self.overview_soup, "#main h1") | ||
overview['description'] = text_or_default(container, 'section > p') | ||
|
||
logo_image_tag = one_or_default( | ||
banner, '.org-top-card-primary-content__logo') | ||
banner_desp = text_or_default(banner, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I might be missing something, but what is "desp"? Should this be "desc"? |
||
'.org-top-card-summary-info-list') | ||
num_followers = banner_desp.split(" ")[-2].strip() | ||
|
||
overview["num_followers"] = num_followers | ||
|
||
logo_image_tag = one_or_default(banner, | ||
'.org-top-card-primary-content__logo') | ||
overview['image'] = logo_image_tag['src'] if logo_image_tag else '' | ||
|
||
company_metadata = get_company_metadata(container) | ||
overview["metadata"] = company_metadata | ||
overview["num_employees"] = get_employee_count(company_metadata.get( | ||
COMPANY_SIZE_KEY, "")) | ||
overview["num_employees"] = get_employee_count( | ||
company_metadata.get(COMPANY_SIZE_KEY, "")) | ||
|
||
return overview | ||
|
||
|
@@ -116,15 +125,24 @@ def life(self): | |
def insights(self):
    """Return headcount-change figures from the company Insights page.

    Reads the summary table (% change in headcount at 6 months, 1 year
    and 2 years) and returns it as a dict keyed by period.
    """
    # Summary table containing the Insights data for % change in
    # headcount at 6m, 1y and 2y.
    summary_table = one_or_default(self.insights_soup,
                                   '.org-insights-module__summary-table')

    # CSS selectors for each period's percentage cell.
    change_selectors = {
        '6m change': 'td:nth-of-type(2) span:nth-of-type(3)',
        '1y change': 'td:nth-of-type(3) span:nth-of-type(3)',
        '2y change': 'td:nth-of-type(4) span:nth-of-type(3)'
    }

    results = {}
    results.update(get_info(summary_table, change_selectors))
    return results
|
||
@property
def people(self):
    """Return the headline text from the company's People/Alumni page.

    The page's <h2> reads like "1,234 employees" (or "... alumni" for
    schools); the trailing label is stripped so only the count remains.
    """
    container = one_or_default(self.people_soup,
                               '.org-grid__content-height-enforcer')
    headline = text_or_default(container, 'div > div > div > h2')
    # Drop the "employees"/"alumni" suffix, keeping just the number text.
    for label in ("employees", "alumni"):
        headline = headline.replace(label, "")
    return headline.strip()
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of the other properties return dictionaries of key/value pairs, but this returns a single string. Even if only a single key is currently used, this should return a dictionary for consistency with every other property. It will also more easily allow adding new fields in the future, if appropriate. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
import logging | ||
|
||
import time | ||
from selenium.common.exceptions import TimeoutException | ||
from selenium.webdriver.common.by import By | ||
from selenium.webdriver.support import expected_conditions as EC | ||
|
@@ -13,13 +13,30 @@ | |
|
||
|
||
class CompanyScraper(Scraper): | ||
def scrape(self, company, overview=True, jobs=False, life=False, insights=False): | ||
self.url = 'https://www.linkedin.com/company/{}'.format(company) | ||
|
||
def scrape(self, | ||
company, | ||
org_type="company", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think this should be removed as an option for now. |
||
overview=True, | ||
jobs=False, | ||
life=False, | ||
insights=False, | ||
people=False): | ||
|
||
# org_type = "company" or "school" | ||
# This will allow to switch between school and company urls | ||
# The underlying functionality is same for both scrapers | ||
# Added parameters - org_type, people | ||
# people page for company is same as alumni for org_type="school" | ||
# people page for company shows employees data whereas for school it shows alumni data | ||
|
||
self.url = 'https://www.linkedin.com/{org_type}/{company}'.format( | ||
org_type=org_type, company=company) | ||
self.company = company | ||
|
||
self.load_initial() | ||
|
||
jobs_html = life_html = insights_html = overview_html = '' | ||
people_html = jobs_html = life_html = insights_html = overview_html = '' | ||
|
||
if overview: | ||
overview_html = self.fetch_page_html('about') | ||
|
@@ -29,14 +46,27 @@ def scrape(self, company, overview=True, jobs=False, life=False, insights=False) | |
jobs_html = self.fetch_page_html('jobs') | ||
if insights: | ||
insights_html = self.fetch_page_html('insights') | ||
return Company(overview_html, jobs_html, life_html, insights_html) | ||
if people: | ||
people_html = self.fetch_page_html('people') | ||
|
||
return Company(overview_html, jobs_html, life_html, insights_html, | ||
people_html) | ||
|
||
def fetch_page_html(self, page): | ||
""" | ||
Navigates to a company subpage and returns the entire HTML contents of the page. | ||
""" | ||
|
||
if page == "people": | ||
interval = 2.0 | ||
else: | ||
interval = 0.1 | ||
|
||
try: | ||
self.driver.get(f"{self.url}/{page}") | ||
# people/alumni javascript takes more time to load | ||
time.sleep(interval) | ||
|
||
Comment on lines
+60
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of a hard-coded sleep, which can cause both unnecessary delays for people with fast internet, and false delays for those with slow internet, I'd instead suggest using a `WebDriverWait` with an explicit expected condition, so the scraper proceeds as soon as the content has actually loaded. |
||
return self.driver.find_element_by_css_selector( | ||
'.organization-outlet').get_attribute('outerHTML') | ||
except Exception as e: | ||
|
@@ -47,20 +77,22 @@ def fetch_page_html(self, page): | |
def load_initial(self):
    """Navigate to the company's landing page and verify it loaded.

    Waits up to ``self.timeout`` seconds for either the company content
    ('.organization-outlet') or LinkedIn's error page ('.error-container')
    to appear, then confirms the company content is actually present.

    Raises:
        ValueError: if the page times out, or the URL does not match any
            company on LinkedIn.
    """
    self.driver.get(self.url)
    try:
        # Either outcome (content or error page) ends the wait; the
        # returned element is not needed, so it is not bound to a name.
        WebDriverWait(self.driver, self.timeout).until(
            AnyEC(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.organization-outlet')),
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.error-container'))))
    except TimeoutException as e:
        # Chain the original timeout so the selenium traceback is kept.
        raise ValueError(
            """Took too long to load company. Common problems/solutions:
1. Invalid LI_AT value: ensure that yours is correct (they
update frequently)
2. Slow Internet: increase the timeout parameter in the Scraper constructor"""
        ) from e
    try:
        self.driver.find_element_by_css_selector('.organization-outlet')
    except Exception as e:  # was a bare except; keep KeyboardInterrupt alive
        raise ValueError(
            'Company Unavailable: Company link does not match any companies on LinkedIn'
        ) from e
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
optional: As a naive reader it would be unclear to be what
sc
is supposed to be. I would suggest:company_info
, or similar.