added feature to alumni and follower count #107


Open: wants to merge 1 commit into master
25 changes: 25 additions & 0 deletions examples/people-to-csv.py
@@ -0,0 +1,25 @@
"""Example to scrape a list of Companies, and put overviews in csv form"""

from scrape_linkedin import CompanyScraper
import pandas as pd

# LIST YOUR COMPANIES HERE
my_company_list = [
'facebook', 'mit-sloan-school-of-management', 'linkedin',
'harvard-university'
]

company_data = []

with CompanyScraper() as scraper:
# Get each company's overview, add to company_data list
for name in my_company_list:
sc = scraper.scrape(company=name, people=True)
Owner:

optional: As a naive reader it would be unclear to me what sc is supposed to be. I would suggest company_info, or similar.
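For example, something like:

        company_info = scraper.scrape(company=name, people=True)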

overview = sc.overview
overview['company_name'] = name
Owner:

optional: The overview already has a name field that seems redundant with this. The ID being saved here is more typically referred to as an "id" or "slug", which may be a more appropriate field name if you need to save it.
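For example, a sketch of the suggested field (the key name is just a suggestion):

        overview['slug'] = name  # the LinkedIn URL identifier, e.g. 'harvard-university'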

overview['people'] = sc.people
company_data.append(overview)

# Turn into dataframe for easy csv output
df = pd.DataFrame(company_data)
df.to_csv('out.csv', index=False)
52 changes: 35 additions & 17 deletions scrape_linkedin/Company.py
@@ -40,7 +40,8 @@ def get_company_metadata(about_section):
elif child.name == 'dd':
content = child.get_text().strip()
results[curr_header].append(
-                RE_DUPLICATE_WHITESPACE.sub(" ", content))  # strip redundant whitespace
+                RE_DUPLICATE_WHITESPACE.sub(
+                    " ", content))  # strip redundant whitespace

for r in results:
results[r] = '\n'.join(results[r])
@@ -58,16 +59,18 @@ def get_employee_count(s: str) -> Optional[int]:
class Company(ResultsObject):
"""Linkedin User Profile Object"""

-    attributes = ['overview', 'jobs', 'life', 'insights']
+    attributes = ['overview', 'jobs', 'life', 'insights', 'people']

# KD adds insights attribute

-    def __init__(self, overview, jobs, life, insights):
+    def __init__(self, overview, jobs, life, insights, people):
# KD fixed attributes making jobs and life undefined as they are defined in CompanyScraper, and this allows insights to work
self.overview_soup = BeautifulSoup(overview, 'html.parser')
self.jobs_soup = BeautifulSoup(jobs, 'html.parser')
self.life_soup = BeautifulSoup(life, 'html.parser')
self.insights_soup = BeautifulSoup(insights, 'html.parser')
# KD adds insights soup
+        self.people_soup = BeautifulSoup(people, 'html.parser')

@property
def overview(self):
@@ -78,12 +81,12 @@ def overview(self):
"image": None,
"name": None,
"num_employees": None,
"num_followers": None,
"metadata": None
}

# Banner containing company Name + Location
-        banner = one_or_default(
-            self.overview_soup, '.org-top-card')
+        banner = one_or_default(self.overview_soup, '.org-top-card')

# Main container with company overview info
container = one_or_default(self.overview_soup,
@@ -92,14 +95,20 @@
overview["name"] = text_or_default(self.overview_soup, "#main h1")
overview['description'] = text_or_default(container, 'section > p')

-        logo_image_tag = one_or_default(
-            banner, '.org-top-card-primary-content__logo')
+        banner_desp = text_or_default(banner,
Owner:

I might be missing something, but what is "desp"? Should this be "desc"?

+                                      '.org-top-card-summary-info-list')
+        num_followers = banner_desp.split(" ")[-2].strip()
+
+        overview["num_followers"] = num_followers
+
+        logo_image_tag = one_or_default(banner,
+                                        '.org-top-card-primary-content__logo')
overview['image'] = logo_image_tag['src'] if logo_image_tag else ''

company_metadata = get_company_metadata(container)
overview["metadata"] = company_metadata
overview["num_employees"] = get_employee_count(company_metadata.get(
COMPANY_SIZE_KEY, ""))
overview["num_employees"] = get_employee_count(
company_metadata.get(COMPANY_SIZE_KEY, ""))

return overview

@@ -116,15 +125,24 @@ def life(self):
def insights(self):

# summary table containing the Insights data for % change in headcount at 6m, 1y and 2y
-        table = one_or_default(
-            self.insights_soup, '.org-insights-module__summary-table')
+        table = one_or_default(self.insights_soup,
+                               '.org-insights-module__summary-table')

insights = {}

-        insights.update(get_info(table, {
-            '6m change': 'td:nth-of-type(2) span:nth-of-type(3)',
-            '1y change': 'td:nth-of-type(3) span:nth-of-type(3)',
-            '2y change': 'td:nth-of-type(4) span:nth-of-type(3)'
-
-        }))
+        insights.update(
+            get_info(
+                table, {
+                    '6m change': 'td:nth-of-type(2) span:nth-of-type(3)',
+                    '1y change': 'td:nth-of-type(3) span:nth-of-type(3)',
+                    '2y change': 'td:nth-of-type(4) span:nth-of-type(3)'
+                }))
return insights

+    @property
+    def people(self):
+        content = one_or_default(self.people_soup,
+                                 '.org-grid__content-height-enforcer')
+        people = text_or_default(content, 'div > div > div > h2')
+        people = people.replace("employees", "").replace("alumni", "").strip()
+        return people
Owner:

All of the other properties return dictionaries of key/value pairs, but this returns a single string.

Even if only a single key is currently used, this should return a dictionary for consistency with every other property. It will also more easily allow adding new fields in the future, if appropriate.
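A minimal sketch of the dictionary-returning version (the 'num_people' key name is just a suggestion):

    @property
    def people(self):
        content = one_or_default(self.people_soup,
                                 '.org-grid__content-height-enforcer')
        heading = text_or_default(content, 'div > div > div > h2')
        count = heading.replace("employees", "").replace("alumni", "").strip()
        return {'num_people': count}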

58 changes: 45 additions & 13 deletions scrape_linkedin/CompanyScraper.py
@@ -1,5 +1,5 @@
import logging

+import time
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
@@ -13,13 +13,30 @@


class CompanyScraper(Scraper):
-    def scrape(self, company, overview=True, jobs=False, life=False, insights=False):
-        self.url = 'https://www.linkedin.com/company/{}'.format(company)
+    def scrape(self,
+               company,
+               org_type="company",
Owner:

org_type is used only to generate a URL, but LinkedIn seems to automatically redirect the URL in the case you use /company/school-id. For example, https://www.linkedin.com/company/harvard-university works (otherwise, your example in people-to-csv.py would break).

I think this should be removed as an option for now.
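That would restore the original URL construction, i.e.:

        self.url = 'https://www.linkedin.com/company/{}'.format(company)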

+               overview=True,
+               jobs=False,
+               life=False,
+               insights=False,
+               people=False):

# org_type = "company" or "school"
# This will allow to switch between school and company urls
# The underlying functionality is same for both scrapers
# Added parameters - org_type, people
# people page for company is same as alumni for org_type="school"
# people page for company shows employees data whereas for school it shows alumni data

self.url = 'https://www.linkedin.com/{org_type}/{company}'.format(
org_type=org_type, company=company)
self.company = company

self.load_initial()

-        jobs_html = life_html = insights_html = overview_html = ''
+        people_html = jobs_html = life_html = insights_html = overview_html = ''

if overview:
overview_html = self.fetch_page_html('about')
@@ -29,14 +46,27 @@ def scrape(self, company, overview=True, jobs=False, life=False, insights=False)
jobs_html = self.fetch_page_html('jobs')
if insights:
insights_html = self.fetch_page_html('insights')
-        return Company(overview_html, jobs_html, life_html, insights_html)
+        if people:
+            people_html = self.fetch_page_html('people')
+
+        return Company(overview_html, jobs_html, life_html, insights_html,
+                       people_html)

def fetch_page_html(self, page):
"""
Navigates to a company subpage and returns the entire HTML contents of the page.
"""

if page == "people":
interval = 2.0
else:
interval = 0.1

try:
self.driver.get(f"{self.url}/{page}")
+            # people/alumni javascript takes more time to load
+            time.sleep(interval)

Owner, on lines +60 to +69:

Instead of a hard-coded sleep, which can cause both unnecessary delays for people with fast internet and failures for those with slow internet, I'd suggest using a WebDriverWait(self.driver, self.timeout).until(...), which you can see an example of in load_initial.
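A sketch of that approach (untested), reusing the '.organization-outlet' selector and the WebDriverWait/EC/By imports already present in this file:

        try:
            self.driver.get(f"{self.url}/{page}")
            # Wait until the page content is present instead of sleeping a fixed interval
            WebDriverWait(self.driver, self.timeout).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '.organization-outlet')))
            return self.driver.find_element_by_css_selector(
                '.organization-outlet').get_attribute('outerHTML')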

return self.driver.find_element_by_css_selector(
'.organization-outlet').get_attribute('outerHTML')
except Exception as e:
@@ -47,20 +77,22 @@ def fetch_page_html(self, page):
def load_initial(self):
self.driver.get(self.url)
try:
-            myElem = WebDriverWait(self.driver, self.timeout).until(AnyEC(
-                EC.presence_of_element_located(
-                    (By.CSS_SELECTOR, '.organization-outlet')),
-                EC.presence_of_element_located(
-                    (By.CSS_SELECTOR, '.error-container'))
-            ))
+            myElem = WebDriverWait(self.driver, self.timeout).until(
+                AnyEC(
+                    EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, '.organization-outlet')),
+                    EC.presence_of_element_located(
+                        (By.CSS_SELECTOR, '.error-container'))))
except TimeoutException as e:
raise ValueError(
"""Took too long to load company. Common problems/solutions:
1. Invalid LI_AT value: ensure that yours is correct (they
update frequently)
-            2. Slow Internet: increase the timeout parameter in the Scraper constructor""")
+            2. Slow Internet: increase the timeout parameter in the Scraper constructor"""
+            )
try:
self.driver.find_element_by_css_selector('.organization-outlet')
except:
raise ValueError(
-                'Company Unavailable: Company link does not match any companies on LinkedIn')
+                'Company Unavailable: Company link does not match any companies on LinkedIn'
+            )
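For reference, a usage sketch of the new people flag (the company ID is just an example):

from scrape_linkedin import CompanyScraper

with CompanyScraper() as scraper:
    company = scraper.scrape(company='linkedin', people=True)
    print(company.overview['num_followers'])
    print(company.people)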