Add a web-scraping script to determine, sort and filter dependent packages #4
Open
grlee77 wants to merge 3 commits into scikit-image:main from grlee77:get_dependents
```python
"""
At the time this script was created (July 2020), GitHub did not offer an
official way to query the dependent packages through their API. So, we instead
use a web-scraping approach via BeautifulSoup, patterned after a response
in this Stack Overflow thread:
https://stackoverflow.com/questions/58734176/how-to-use-github-api-to-get-a-repositorys-dependents-information-in-github

To retrieve topic lists via the GitHub API, the user must have defined a
GITHUB_TOKEN environment variable.

This script generates three lists of packages:

1.) One that has ALL dependents that are active repositories (i.e. no "Ghost"
    icon in the web page).
2.) One that only retains packages with >= min_stars stars, but also
    includes a list of the GitHub "topics" associated with each package.
3.) A third list that is based on filtering the second list. During filtering,
    a package is retained if either:
    a.) any string from repo_name_terms is in the repository organization/name
    b.) any topic in the repo's topic list matches a topic in topic_search_terms

The three variables containing the lists described above are:

Outputs
-------
all_packages : list of tuple
    Each element is a (name, forks, stars) tuple.
popular_packages : list of tuple
    Each element is a (name, forks, stars, topics) tuple.
popular_filtered_packages : list of tuple
    Each element is a (name, forks, stars, topics) tuple.
"""

import os
import pickle

from bs4 import BeautifulSoup
from github import Github
import pandas
import requests

# we use PyGitHub to retrieve topic lists
token = os.environ['GITHUB_TOKEN']
g = Github(token)

# ----------------------------------
# START OF USER-CONFIGURABLE OPTIONS
# ----------------------------------

# The repository we will query for its dependents
repo_to_query = "scikit-image/scikit-image"

# Retrieve detailed topic lists only for the packages with >= min_stars stars.
min_stars = 5

# If True, write the three lists to .pickle files in the current directory.
save_to_pickle = False
# If True, write the three lists to .csv files in the current directory.
save_to_csv = True

# Search terms of interest in the repository organization/name
# (see description at top). All terms should be in lower case.
repo_name_terms = [
    'brain',
    'cell',
    'ecg',
    'eeg',
    'medi',
    'mri',
    'neuro',
    'pathol',
    'retin',
    'slide',
    'spectro',
    'tissue',
    'tomo',
]

# Search terms of interest in the repository's topics (see description at top).
# This list was created to match bio-image applications by manually curating
# topic names from the full list of packages.
topic_search_terms = [
    'airways',
    'anatomy',
    'arteries',
    'astrocytes',
    'atomic-force-microscopy',
    'afm',
    'axon',
    'bioimage-informatics',
    'bioinformatics',
    'biologists',
    'biomedical-image-processing',
    'bionic-vision',
    'biophysics',
    'brain-connectivity',
    'brain-imaging',
    'brain-mri',
    'brain-tumor-segmentation',
    'brats',
    'calcium',
    'cancer-research',
    'cell-biology',
    'cell-detection',
    'cell-segmentation',
    'computational-pathology',
    'connectome',
    'connectomics',
    'cryo-em',
    'ct-data',
    'deconvolution-microscopy',
    'dicom',
    'dicom-rt',
    'digital-pathology-data',
    'digital-pathology',
    'digital-slide-archive',
    'dmri',
    'electron-microscopy',
    'electrophysiology',
    'fluorescence',
    'fluorescence-microscopy-imaging',
    'fmri',
    'fmri-preprocessing',
    'functional-connectomes',
    'healthcare-imaging',
    'histology',
    'voxel',
    'microorganism-colonies',
    'microscopy',
    'microscopy-images',
    'neuroimaging',
    'medical',
    'medical-image-computing',
    'medical-image-processing',
    'medical-images',
    'medical-imaging',
    'mri',
    'myelin',
    'neural-engineering',
    'neuroanatomy',
    'neuroimaging-analysis',
    'neuropoly',
    'neuroscience',
    'nih-brain-initiative',
    'openslide',
    'pathology',
    'pathology-image',
    'radiation-oncology',
    'radiation-physics',
    'raman',
    'retinal-implants',
    'scanning-probe-microscopy',
    'scanning-tunnelling-microscopy',
    'single-cell-imaging',
    'slide-images',
    'spectroscopy',
    'spinalcord',
    'stm',
    'stem',
    'stitching',
    'structural-connectomes',
    'tissue-localization',
    'tomography',
    'volumetric-images',
    'whole-slide-image',
    'whole-slide-imaging',
]

# Omit the following repositories from the filtered list.
# These match at least one of the search terms above, but do not appear to be
# biology-focused (e.g. the term "cell" appears in "Marcello").
omit_list = [
    'Marcello-Sega/pytim',
    'PMEAL/porespy',
]

# --------------------------------
# END OF USER-CONFIGURABLE OPTIONS
# --------------------------------

# Parse at most this many web pages.
# Parsing should automatically stop when reaching the last page.
max_page_num = 100

url = ('https://github.com/{}/network/dependents'
       '?dependent_type=PACKAGE').format(repo_to_query)

package_list = []
ghost_list = []
prev_len = 0
for i in range(max_page_num):
    # retrieve HTML for the current URL
    print("GET " + url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")

    page_package_list = []
    page_ghost_list = []
    for t in soup.find_all("div", {"class": "Box-row"}):
        try:
            # find repository org/name
            name = "{}/{}".format(
                t.find('a', {"data-repository-hovercards-enabled": ""}).text,
                t.find('a', {"data-hovercard-type": "repository"}).text
            )
        except AttributeError:
            # Ghost repositories will give None for the find() calls above.
            # This results in an AttributeError when trying to access .text
            page_ghost_list.append(t.text)
            continue

        # extract the number of stars
        stars = 'unknown'
        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
            svg_star = span.find_all('svg', attrs={'class': 'octicon-star'})
            if svg_star:
                # strip "," in e.g. "1,000" before casting to int
                stars = int(span.text.strip().replace(",", ""))
                break

        # extract the number of forks
        forks = 'unknown'
        for span in t.find_all('span', attrs={'class': 'text-gray-light'}):
            svg_fork = span.find_all('svg',
                                     attrs={'class': 'octicon-repo-forked'})
            if svg_fork:
                # strip "," in e.g. "1,000" before casting to int
                forks = int(span.text.strip().replace(",", ""))
                break

        page_package_list.append((name, forks, stars))

    # append packages from the current page to the overall lists
    package_list = package_list + page_package_list
    ghost_list = ghost_list + page_ghost_list

    # remove any duplicates
    package_list = list(set(package_list))
    ghost_list = list(set(ghost_list))

    # terminate if no change from the prior URL
    new_len = len(package_list) + len(ghost_list)
    if new_len == prev_len:
        print("no change in package lists... stopping scraping")
        break
    prev_len = new_len

    # find the URL for the "Next" page of packages
    pagination_links = soup.find(
        "div", {"class": "paginate-container"}).find_all('a')
    url = None
    for link in pagination_links:
        # make sure we retrieve the "Next" page and not the "Previous" one
        if link.text == "Next":
            url = link["href"]
    if url is None:
        print("No 'Next' page link found... stopping scraping")
        break

# Sort by descending number of stars.
# This is the first list mentioned at the top.
# Star counts that could not be parsed are the string 'unknown'; treat those
# as -1 so comparisons against integer star counts do not raise a TypeError.
def star_count(p):
    return p[2] if isinstance(p[2], int) else -1


all_packages = sorted(package_list, key=star_count, reverse=True)

# Create the second list by retaining only those with >= min_stars.
# Note that in the package list, the tuple is:
#     (name, # of forks, # of stars)
_popular_packages = [p for p in all_packages if star_count(p) >= min_stars]
n_popular = len(_popular_packages)

# add a 4th element to each tuple, containing the GitHub topic list
popular_packages = []
for n, p in enumerate(_popular_packages):
    print("Retrieving topics for package {} of {}".format(n + 1, n_popular))
    repo_name = p[0]
    repo = g.get_repo(repo_name)
    topics = repo.get_topics()
    popular_packages.append(p + (topics,))
| print("Applying filtering") | ||
| popular_filtered_packages = [] | ||
| for p in popular_packages: | ||
| name = p[0] | ||
| name_lower = name.lower() | ||
| if name in omit_list: | ||
| continue | ||
| topics = p[3] | ||
| keep = False # unless we match a term below, we will exclude the package | ||
|
|
||
| # check match based on repository organization/name | ||
| for m in repo_name_terms: | ||
| if m in name_lower: | ||
| keep = True | ||
| break | ||
|
|
||
| # If not already a match, search based on topic search terms | ||
| if not keep: | ||
| for topic in topics: | ||
| if topic in topic_search_terms: | ||
| keep = True | ||
| break | ||
| if keep: | ||
| popular_filtered_packages.append(p) | ||
|
|
||

# dump output lists to pickle files in the current directory
fname_base = repo_to_query.replace('/', '_')
if save_to_pickle:
    print("Writing pickle files")

    with open(fname_base + '_all_packages.pickle', 'wb') as f:
        pickle.dump(all_packages, f)

    with open(fname_base + '_popular_packages.pickle', 'wb') as f:
        pickle.dump(popular_packages, f)

    with open(fname_base + '_popular_filtered_packages.pickle', 'wb') as f:
        pickle.dump(popular_filtered_packages, f)

if save_to_csv:
    print("Writing CSV files")
    df_all = pandas.DataFrame(
        all_packages,
        columns=('name', '# of forks', '# of stars')
    )
    df_all = df_all.set_index('name')
    df_all.to_csv(fname_base + '_all_dependents.csv')

    df_popular = pandas.DataFrame(
        popular_packages,
        columns=('name', '# of forks', '# of stars', 'topics')
    )
    df_popular = df_popular.set_index('name')
    df_popular.to_csv(fname_base + '_popular_dependents.csv')

    df_filtered_popular = pandas.DataFrame(
        popular_filtered_packages,
        columns=('name', '# of forks', '# of stars', 'topics')
    )
    df_filtered_popular = df_filtered_popular.set_index('name')
    df_filtered_popular.to_csv(fname_base + '_filtered_dependents.csv')

# print(df_filtered_popular.to_markdown())
```
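For reference, the row-parsing logic can be exercised offline on a hand-written HTML fragment. This is a minimal sketch: the markup below only mimics the structure the script expects from the mid-2020 dependents page, and the class names and the example org/repo are assumptions; GitHub's real markup may have changed since.

```python
from bs4 import BeautifulSoup

# Hand-written fragment mimicking one "Box-row" of the dependents page
# (class names are assumptions based on the mid-2020 markup the script targets)
html = """
<div class="Box-row">
  <a data-repository-hovercards-enabled="">some-org</a> /
  <a data-hovercard-type="repository">some-repo</a>
  <span class="text-gray-light"><svg class="octicon-star"></svg> 1,234</span>
  <span class="text-gray-light"><svg class="octicon-repo-forked"></svg> 56</span>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
row = soup.find("div", {"class": "Box-row"})

# same org/name extraction as the script
name = "{}/{}".format(
    row.find('a', {"data-repository-hovercards-enabled": ""}).text,
    row.find('a', {"data-hovercard-type": "repository"}).text,
)

# same star extraction as the script: find the span containing a star icon
stars = 'unknown'
for span in row.find_all('span', attrs={'class': 'text-gray-light'}):
    if span.find_all('svg', attrs={'class': 'octicon-star'}):
        stars = int(span.text.strip().replace(",", ""))
        break

print(name, stars)  # some-org/some-repo 1234
```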
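The topic lookup issues one authenticated API call per popular package, so long runs can eat into the GitHub rate limit (5000 requests per hour for authenticated users). A minimal sketch for checking the remaining budget with PyGitHub before starting, assuming a valid GITHUB_TOKEN is exported; the printed topics are illustrative:

```python
import os
from github import Github

g = Github(os.environ['GITHUB_TOKEN'])

# one API call is made per repository when fetching topics;
# check how many requests remain before throttling kicks in
print(g.get_rate_limit().core.remaining)

repo = g.get_repo("scikit-image/scikit-image")
print(repo.get_topics())  # e.g. ['image-processing', 'python', ...]
```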
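The keep-or-skip rule in the filtering loop can equivalently be written as a small predicate. This is just an illustration of the matching logic with toy data, not part of the diff:

```python
def matches(name, topics, name_terms, topic_terms):
    # keep a repo if its org/name contains any name search term, or if any
    # of its GitHub topics appears in the topic search list
    name_lower = name.lower()
    return (any(term in name_lower for term in name_terms)
            or any(topic in set(topic_terms) for topic in topics))


# quick checks with toy data
assert matches("some-org/neuro-tools", [], ["neuro"], [])
assert matches("some-org/foo", ["microscopy"], ["neuro"], ["microscopy"])
assert not matches("some-org/bar", ["games"], ["neuro"], ["microscopy"])
```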
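When save_to_csv is enabled, the output can be reloaded later for analysis. A small usage sketch; the filename shown matches what the script writes for the default repo_to_query:

```python
import pandas

df = pandas.read_csv("scikit-image_scikit-image_all_dependents.csv",
                     index_col="name")

# Star counts that could not be scraped are stored as 'unknown';
# coerce those to NaN so the column sorts numerically.
df["# of stars"] = pandas.to_numeric(df["# of stars"], errors="coerce")
print(df.sort_values("# of stars", ascending=False).head(10))
```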