added feature to alumni and follower count #107


Open: wants to merge 1 commit into master
25 changes: 25 additions & 0 deletions examples/people-to-csv.py
@@ -0,0 +1,25 @@
"""Example to scrape a list of Companies, and put overviews in csv form"""

from scrape_linkedin import CompanyScraper
import pandas as pd

# LIST YOUR COMPANIES HERE
my_company_list = [
'facebook', 'mit-sloan-school-of-management', 'linkedin',
'harvard-university'
]

company_data = []

with CompanyScraper() as scraper:
# Get each company's overview, add to company_data list
for name in my_company_list:
sc = scraper.scrape(company=name, people=True)
Owner:

optional: As a naive reader it would be unclear to me what sc is supposed to be. I would suggest company_info, or similar.
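For example, something like:

        company_info = scraper.scrape(company=name, people=True)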

overview = sc.overview
overview['company_name'] = name
Owner:

optional: The overview already has a name field that seems redundant with this. The ID being saved here is more typically referred to as an "id" or "slug", which may be a more appropriate field name if you need to save it.
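For example, a sketch of the suggested field (the key name is just a suggestion):

        overview['slug'] = name  # the LinkedIn URL identifier, e.g. 'harvard-university'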

overview['people'] = sc.people
company_data.append(overview)

# Turn into dataframe for easy csv output
df = pd.DataFrame(company_data)
df.to_csv('out.csv', index=False)
52 changes: 35 additions & 17 deletions scrape_linkedin/Company.py
@@ -40,7 +40,8 @@ def get_company_metadata(about_section):
elif child.name == 'dd':
content = child.get_text().strip()
results[curr_header].append(
-                RE_DUPLICATE_WHITESPACE.sub(" ", content))  # strip redundant whitespace
+                RE_DUPLICATE_WHITESPACE.sub(
+                    " ", content))  # strip redundant whitespace

for r in results:
results[r] = '\n'.join(results[r])
@@ -58,16 +59,18 @@ def get_employee_count(s: str) -> Optional[int]:
class Company(ResultsObject):
"""Linkedin User Profile Object"""

-    attributes = ['overview', 'jobs', 'life', 'insights']
+    attributes = ['overview', 'jobs', 'life', 'insights', 'people']

# KD adds insights attribute

-    def __init__(self, overview, jobs, life, insights):
+    def __init__(self, overview, jobs, life, insights, people):
# KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work
self.overview_soup = BeautifulSoup(overview, 'html.parser')
self.jobs_soup = BeautifulSoup(jobs, 'html.parser')
self.life_soup = BeautifulSoup(life, 'html.parser')
self.insights_soup = BeautifulSoup(insights, 'html.parser')
# KD adds insights soup
+        self.people_soup = BeautifulSoup(people, 'html.parser')

@property
def overview(self):
@@ -78,12 +81,12 @@ def overview(self):
"image": None,
"name": None,
"num_employees": None,
"num_followers": None,
"metadata": None
}

# Banner containing company Name + Location
-        banner = one_or_default(
-            self.overview_soup, '.org-top-card')
+        banner = one_or_default(self.overview_soup, '.org-top-card')

# Main container with company overview info
container = one_or_default(self.overview_soup,
@@ -92,14 +95,20 @@
overview["name"] = text_or_default(self.overview_soup, "#main h1")
overview['description'] = text_or_default(container, 'section > p')

-        logo_image_tag = one_or_default(
-            banner, '.org-top-card-primary-content__logo')
+        banner_desp = text_or_default(banner,
Owner:

I might be missing something, but what is "desp"? Should this be "desc"?

+                                      '.org-top-card-summary-info-list')
+        num_followers = banner_desp.split(" ")[-2].strip()
+
+        overview["num_followers"] = num_followers
+
+        logo_image_tag = one_or_default(banner,
+                                        '.org-top-card-primary-content__logo')
overview['image'] = logo_image_tag['src'] if logo_image_tag else ''

company_metadata = get_company_metadata(container)
overview["metadata"] = company_metadata
overview["num_employees"] = get_employee_count(company_metadata.get(
COMPANY_SIZE_KEY, ""))
overview["num_employees"] = get_employee_count(
company_metadata.get(COMPANY_SIZE_KEY, ""))

return overview

@@ -116,15 +125,24 @@ def life(self):
def insights(self):

# summary table containing the Insights data for % change in headcount at 6m, 1y and 2y
-        table = one_or_default(
-            self.insights_soup, '.org-insights-module__summary-table')
+        table = one_or_default(self.insights_soup,
+                               '.org-insights-module__summary-table')

insights = {}

-        insights.update(get_info(table, {
-            '6m change': 'td:nth-of-type(2) span:nth-of-type(3)',
-            '1y change': 'td:nth-of-type(3) span:nth-of-type(3)',
-            '2y change': 'td:nth-of-type(4) span:nth-of-type(3)'
-
-        }))
+        insights.update(
+            get_info(
+                table, {
+                    '6m change': 'td:nth-of-type(2) span:nth-of-type(3)',
+                    '1y change': 'td:nth-of-type(3) span:nth-of-type(3)',
+                    '2y change': 'td:nth-of-type(4) span:nth-of-type(3)'
+                }))
return insights

+    @property
+    def people(self):
+        content = one_or_default(self.people_soup,
+                                 '.org-grid__content-height-enforcer')
+        people = text_or_default(content, 'div > div > div > h2')
+        people = people.replace("employees", "").replace("alumni", "").strip()
+        return people
Owner:

All of the other properties return dictionaries of key/value pairs, but this returns a single string.

Even if only a single key is currently used, this should return a dictionary for consistency with every other property. It will also more easily allow adding new fields in the future, if appropriate.
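A minimal sketch of the dictionary-returning version (the 'num_people' key name is just a suggestion):

    @property
    def people(self):
        content = one_or_default(self.people_soup,
                                 '.org-grid__content-height-enforcer')
        heading = text_or_default(content, 'div > div > div > h2')
        count = heading.replace("employees", "").replace("alumni", "").strip()
        return {'num_people': count}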

58 changes: 45 additions & 13 deletions scrape_linkedin/CompanyScraper.py
@@ -1,5 +1,5 @@
import logging

+import time
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
@@ -13,13 +13,30 @@


class CompanyScraper(Scraper):
-    def scrape(self, company, overview=True, jobs=False, life=False, insights=False):
-        self.url = 'https://www.linkedin.com/company/{}'.format(company)
+    def scrape(self,
+               company,
+               org_type="company",
Owner:

org_type is used only to generate a URL, but LinkedIn seems to automatically redirect the URL in the case you use /company/school-id. For example, https://www.linkedin.com/company/harvard-university works (otherwise, your example in people-to-csv.py would break).

I think this should be removed as an option for now.
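That would restore the original URL construction, i.e.:

        self.url = 'https://www.linkedin.com/company/{}'.format(company)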

+               overview=True,
+               jobs=False,
+               life=False,
+               insights=False,
+               people=False):

# org_type = "company" or "school"
# This will allow to switch between school and company urls
# The underlying functionality is same for both scrapers
# Added parameters - org_type, people
# people page for company is same as alumni for org_type="school"
# people page for company shows employees data whereas for school it shows alumni data

self.url = 'https://www.linkedin.com/{org_type}/{company}'.format(
org_type=org_type, company=company)
self.company = company

self.load_initial()

-        jobs_html = life_html = insights_html = overview_html = ''
+        people_html = jobs_html = life_html = insights_html = overview_html = ''

if overview:
overview_html = self.fetch_page_html('about')
@@ -29,14 +46,27 @@ def scrape(self, company, overview=True, jobs=False, life=False, insights=False)
jobs_html = self.fetch_page_html('jobs')
if insights:
insights_html = self.fetch_page_html('insights')
-        return Company(overview_html, jobs_html, life_html, insights_html)
+        if people:
+            people_html = self.fetch_page_html('people')
+
+        return Company(overview_html, jobs_html, life_html, insights_html,
+                       people_html)

def fetch_page_html(self, page):
"""
Navigates to a company subpage and returns the entire HTML contents of the page.
"""

if page == "people":
interval = 2.0
else:
interval = 0.1

try:
self.driver.get(f"{self.url}/{page}")
+            # people/alumni javascript takes more time to load
+            time.sleep(interval)

Owner, on lines +60 to +69:

Instead of a hard-coded sleep, which can cause both unnecessary delays for people with fast internet and failures for those with slow internet, I'd suggest using a WebDriverWait(self.driver, self.timeout).until(...), which you can see an example of in load_initial.
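A sketch of that approach (untested), reusing the '.organization-outlet' selector and the WebDriverWait/EC/By imports already present in this file:

        try:
            self.driver.get(f"{self.url}/{page}")
            # Wait until the page content is present instead of sleeping a fixed interval
            WebDriverWait(self.driver, self.timeout).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.organization-outlet')))
            return self.driver.find_element_by_css_selector(
                '.organization-outlet').get_attribute('outerHTML')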

return self.driver.find_element_by_css_selector(
'.organization-outlet').get_attribute('outerHTML')
except Exception as e:
@@ -47,20 +77,22 @@ def fetch_page_html(self, page):
def load_initial(self):
self.driver.get(self.url)
try:
-            myElem = WebDriverWait(self.driver, self.timeout).until(AnyEC(
-                EC.presence_of_element_located(
-                    (By.CSS_SELECTOR, '.organization-outlet')),
-                EC.presence_of_element_located(
-                    (By.CSS_SELECTOR, '.error-container'))
-            ))
+            myElem = WebDriverWait(self.driver, self.timeout).until(
+                AnyEC(
+                    EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, '.organization-outlet')),
+                    EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, '.error-container'))))
except TimeoutException as e:
raise ValueError(
"""Took too long to load company. Common problems/solutions:
1. Invalid LI_AT value: ensure that yours is correct (they
update frequently)
-            2. Slow Internet: increase the timeout parameter in the Scraper constructor""")
+            2. Slow Internet: increase the timeout parameter in the Scraper constructor"""
+            )
try:
self.driver.find_element_by_css_selector('.organization-outlet')
except:
raise ValueError(
-                'Company Unavailable: Company link does not match any companies on LinkedIn')
+                'Company Unavailable: Company link does not match any companies on LinkedIn'
+            )
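For reference, a usage sketch of the new people flag (the company ID is just an example):

from scrape_linkedin import CompanyScraper

with CompanyScraper() as scraper:
    company = scraper.scrape(company='linkedin', people=True)
    print(company.overview['num_followers'])
    print(company.people)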