[ADDED]: LinkedIn Profile

Killer2OP · Killer2OP · commit ec813b5b6e77 · 2023-07-22T17:52:30.000+05:30
diff --git a/LinkedIn_Profile_Info/README.md b/LinkedIn_Profile_Info/README.md
@@ -0,0 +1,47 @@
+# LinkedIn Profile Picture
+
+Python package to crawl LinkedIn profile pictures using the Google Custom Search API.
+
+## Overview
+
+This Python package allows you to retrieve LinkedIn profile pictures by providing the profile URL. The package uses the Google Custom Search API to search for the profile pictures associated with the LinkedIn ID.
+
+## Features
+
+- Retrieve LinkedIn profile picture URL using the profile URL.
+- Get additional profile information such as name, headline, and public URL.
+
+## Installation
+
+You can install the package using pip:
+
+```bash
+pip install linkedin-profile-picture
+```
+## Information about Google API
+
+The Google Custom Search API client is implemented in `google_API.py`.
+
+`google_API.py` defines a Python class, `GoogleSearchAPI`, that queries the Google Custom Search API and retrieves search results related to a specific LinkedIn profile. The sections below explain how the class works and how it fits into the rest of the package:
+
+1. GoogleSearchAPI class:
+
+- Initialization (`__init__`): The `GoogleSearchAPI` class is initialized with two parameters, `key` and `cx`. These are the API key and the custom search engine ID required to access the Google Custom Search API.
+
+- API Request (_hit_api): The _hit_api method is responsible for making requests to the Google Custom Search API. It takes a LinkedIn ID (extracted from the LinkedIn profile URL) as input and constructs a request to search for results related to that ID. The method supports pagination to retrieve multiple pages of search results.
+
+- Response Handling (_create_api_response): The _create_api_response method processes the API response and extracts relevant information from it. If the API response status code is 200, it retrieves search results and stores them in the results list. Otherwise, it stores the error response in the error attribute of an APIResponse object.
+
+2. Usage:
+To use the GoogleSearchAPI class, you would typically do the following steps:
+
+- Import the required modules and create an instance of the GoogleSearchAPI class, providing your API key and custom search engine ID.
+
+- Get the LinkedIn profile URL for which you want to find the profile picture. Extract the LinkedIn ID from this URL using the `extract_id` method of the `ProfilePicture` class.
+
+- Use the get_profile_picture method of the ProfilePicture class to get the profile picture URL by passing the LinkedIn profile URL as input. This method internally uses the GoogleSearchAPI class to perform the custom search and extract the profile picture URL.
+
+- Optionally, you can use the get_profile_info method of the ProfilePicture class to fetch additional profile information like name, headline, and public URL from the LinkedIn profile.
+
+3. API Rate Limiting:
+The `GoogleSearchAPI` class in `google_API.py` handles API rate limiting. If the Google Custom Search API returns status code 429 (Too Many Requests), the rate limit for the current period has been reached. In that case the code waits for the number of seconds given in the "Retry-After" response header (5 seconds if the header is missing) and then retries the request, so it does not keep hitting the API while it is rate limited. Note that the copy of `GoogleSearchAPI` defined in `profile.py` does not retry: it logs a warning and stops on any non-200 response.
diff --git a/LinkedIn_Profile_Info/google_API.py b/LinkedIn_Profile_Info/google_API.py
@@ -0,0 +1,45 @@
+import re
+import requests
+import logging
+import time
+
+logger = logging.getLogger(__name__)
+
+class GoogleSearchAPI:
+    """Small client for the Google Custom Search API, queried by LinkedIn ID."""
+    def __init__(self, key: str, cx: str, timeout: float = 10, max_retries: int = 3):
+        """Store the API key, engine ID ``cx``, timeout (s) and consecutive-429 retry cap."""
+        self._cx = cx
+        self._key = key
+        self._timeout = timeout
+        self._max_retries = max_retries
+        self._api_url = "https://www.googleapis.com/customsearch/v1"
+        self._params = {"num": 10, "cx": self._cx, "key": self._key}
+
+    def _hit_api(self, linkedin_id: str) -> list:
+        """Return all result items for ``/in/<linkedin_id>``; errors are logged, not raised."""
+        results, retries = [], 0
+        params = {**self._params, "exactTerms": f"/in/{linkedin_id}"}
+        try:
+            while True:
+                resp = requests.get(self._api_url, params=params, timeout=self._timeout)
+                if resp.status_code == 429 and retries < self._max_retries:
+                    retries += 1
+                    # Retry-After may be absent or an HTTP date; wait 5s in that case.
+                    header = resp.headers.get("Retry-After", "").strip()
+                    retry_after = int(header) if header.isdecimal() else 5
+                    logger.warning(f"Google Custom Search API rate limit reached. Retrying in {retry_after} seconds.")
+                    time.sleep(retry_after)
+                    continue
+                resp.raise_for_status()  # 4xx/5xx, or a 429 once retries run out
+                data, retries = resp.json(), 0  # a good page resets the retry budget
+                results.extend(data.get("items", []))
+                next_page = data.get("queries", {}).get("nextPage", [])
+                if not next_page:
+                    break
+                params["start"] = next_page[0]["startIndex"]
+        except requests.exceptions.RequestException as e:
+            logger.exception(f"Error in _hit_api: {e}")
+        except Exception:
+            logger.exception("An error occurred while processing the API response.")
+        return results
diff --git a/LinkedIn_Profile_Info/profile.py b/LinkedIn_Profile_Info/profile.py
@@ -0,0 +1,106 @@
+import re
+import requests
+from urllib.parse import urlparse, unquote
+import logging
+
+logger = logging.getLogger(__name__)
+
+class GoogleSearchAPI:
+    """Minimal Google Custom Search client used to look up LinkedIn profile pages."""
+
+    def __init__(self, key: str, cx: str, timeout: float = 10):
+        """Keep the API key, search engine ID ``cx`` and request timeout (seconds)."""
+        self._cx = cx
+        self._key = key
+        self._timeout = timeout
+        self._api_url = "https://www.googleapis.com/customsearch/v1"
+        self._params = {"num": 10, "cx": self._cx, "key": self._key}
+
+    def _hit_api(self, linkedin_id: str) -> list:
+        """Collect result items for ``/in/<linkedin_id>`` across all result pages.
+
+        Non-200 responses and exceptions are logged; partial results are returned."""
+        results = []
+        try:
+            params = {**self._params, "exactTerms": f"/in/{linkedin_id}"}
+            while True:
+                # Without a timeout a stalled connection would block the caller forever.
+                resp = requests.get(self._api_url, params=params, timeout=self._timeout)
+                if resp.status_code != 200:
+                    logger.warning(f"Google Custom Search API error: {resp.status_code} - {resp.text}")
+                    break
+                data = resp.json()
+                results.extend(data.get("items", []))
+                next_page = data.get("queries", {}).get("nextPage", [])
+                if not next_page:
+                    break
+                params["start"] = next_page[0]["startIndex"]
+        except Exception:
+            logger.exception("Error in _hit_api:")
+        return results
+
+class ProfilePicture:
+    """Find a LinkedIn profile's picture and basic details in Google search results."""
+
+    def __init__(self, key: str, cx: str):
+        """Build the search client from the API key and search engine ID ``cx``."""
+        self._api_obj = GoogleSearchAPI(key, cx)
+
+    def extract_id(self, link: str) -> str:
+        """Return the URL-decoded LinkedIn ID that follows ``/in/`` in ``link``.
+
+        Without an ``/in/`` segment, ``link`` itself is treated as the ID."""
+        match = re.search(r"/in/([^/]+)/?", urlparse(link).path)
+        linkedin_id = match.group(1).strip() if match else link
+        return unquote(linkedin_id.strip("/"))
+
+    def _check_picture_url(self, link: str) -> bool:
+        """Return True if ``link`` looks like a LinkedIn CDN profile-photo URL."""
+        # media.licdn.com is the current host; legacy media-exp<N> hosts still match.
+        pattern = r"(media(?:-exp\d)?\.licdn\.com).+?(profile-displayphoto-shrink_)"
+        return bool(re.search(pattern, link))
+
+    def _check_url_exists(self, link: str) -> bool:
+        """Return True if a HEAD request to ``link`` answers 200 within 5 seconds."""
+        try:
+            return requests.head(link, timeout=5).status_code == 200
+        except requests.RequestException:
+            return False
+
+    def _extract_profile_picture(self, linkedin_id: str, res: list) -> str:
+        """Return the first reachable profile-photo URL for ``linkedin_id``.
+
+        Only results whose link resolves to ``linkedin_id`` are considered;
+        an empty string is returned when no ``og:image`` tag qualifies.
+        """
+        for item in res:
+            if self.extract_id(item.get("link", "")) != linkedin_id:
+                continue
+            for tag in item.get("pagemap", {}).get("metatags", []):
+                url = tag.get("og:image")
+                # Skip missing/null og:image values instead of failing in re.search.
+                if url and self._check_picture_url(url) and self._check_url_exists(url):
+                    return url
+        return ""
+
+    def _extract_profile_info(self, linkedin_id: str, res: list) -> dict:
+        """Return ``name``, ``headline`` and ``public_url`` for ``linkedin_id``.
+
+        Values come from the first result whose link resolves to
+        ``linkedin_id``; an empty dict is returned when there is none.
+        """
+        for item in res:
+            linkedin_url = item.get("link", "")
+            if self.extract_id(linkedin_url) == linkedin_id:
+                return {"name": item.get("title"), "headline": item.get("snippet"), "public_url": linkedin_url}
+        return {}
+
+    def get_profile_picture(self, link: str) -> str:
+        """Return the picture URL for the profile at ``link``, or "" if none is found."""
+        linkedin_id = self.extract_id(link)
+        return self._extract_profile_picture(linkedin_id, self._api_obj._hit_api(linkedin_id))
+
+    def get_profile_info(self, link: str) -> dict:
+        """Return the name/headline/public URL dict for the profile at ``link``."""
+        linkedin_id = self.extract_id(link)
+        return self._extract_profile_info(linkedin_id, self._api_obj._hit_api(linkedin_id))
diff --git a/LinkedIn_Profile_Info/requirements.txt b/LinkedIn_Profile_Info/requirements.txt
@@ -0,0 +1 @@
+requests>=2.26.0