Skip to content

Commit 2e71c78

Browse files
authored
Merge pull request #39 from KnowledgeCaptureAndDiscovery/cli
CLI Branch PR
2 parents 6968d28 + a491bbd commit 2e71c78

File tree

7 files changed

+1423
-0
lines changed

7 files changed

+1423
-0
lines changed

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,43 @@
11
# SM2KG
22
Software Metadata 2 Knowledge Graphs: A tool for automatically extracting relevant information from readme files
3+
4+
Installation Instructions -
5+
6+
`pip3 install -r requirements.txt`
7+
8+
Create a config.json file using the sample file in the repository.
9+
10+
Command Line Interface -
11+
12+
createJSON.py generates a JSON object after extracting useful information from the github repository. It classifies the readme file into one of four categories - description, invocation, installation, citation depending on highest confidence above a given threshold.
13+
14+
The createJSON.py file takes as input the following parameters:
15+
16+
-r / --repo_url: Link to the github repository for extracting information
17+
18+
-m / --model_path: Path to the pickled models for extraction
19+
20+
-o / --output: Output file name
21+
22+
-t / --threshold: Threshold to classify the content of the readme file
23+
24+
-d / --doc_src: Path of documentation file
25+
26+
27+
cli.py generates a JSON object after extracting useful information from the github repository. It classifies the readme file into one of four categories - description, invocation, installation, citation depending on confidence above a given threshold.
28+
29+
The cli.py file takes as input the following parameters:
30+
31+
-r / --repo_url: Link to the github repository for extracting information
32+
33+
-o / --output: Output file name
34+
35+
-t / --threshold: Threshold to classify the content of the readme file
36+
37+
-d / --doc_src: Path of documentation file
38+
39+
Example:
40+
41+
`python3 createJSON.py -r https://github.com/{owner}/{repository_name} -m ./models/ -o output.json -t 0.5`
42+
43+
`python3 cli.py -r https://github.com/{owner}/{repository_name} -o output.json -t 0.5`

cli.py

Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
# cli.py
2+
# parameters:
3+
## input file: either: url to github repository OR markdown documentation file path
4+
## output file: json with each excerpt marked with all four classification scores
5+
6+
import argparse
7+
import json
8+
import base64
9+
from urllib.parse import urlparse
10+
import sys
11+
import os
12+
from os import path
13+
import requests
14+
from markdown import Markdown
15+
from bs4 import BeautifulSoup
16+
from io import StringIO
17+
import pickle
18+
import pprint
19+
import pandas as pd
20+
import numpy as np
21+
import re
22+
23+
## Markdown to plain text conversion: begin ##
# code snippet from https://stackoverflow.com/a/54923798
def unmark_element(element, stream=None):
    """Recursively collect the text content of an ElementTree element.

    Writes element.text, every child's text (depth-first), and element.tail
    into *stream* (a StringIO created on the outermost call) and returns the
    accumulated plain-text string.
    """
    buf = StringIO() if stream is None else stream
    head = element.text
    if head:
        buf.write(head)
    for child in element:
        unmark_element(child, buf)
    tail = element.tail
    if tail:
        buf.write(tail)
    return buf.getvalue()
35+
36+
# patching Markdown: register a "plain" output format whose serializer
# strips all markup instead of emitting HTML
Markdown.output_formats["plain"] = unmark_element
_plain_markdown = Markdown(output_format="plain")
_plain_markdown.stripTopLevelTags = False

def unmark(text):
    """Return *text* (markdown) converted to plain text."""
    return _plain_markdown.convert(text)
## Markdown to plain text conversion: end ##
44+
45+
def restricted_float(x):
    """argparse type: parse *x* as a float and require it to lie in [0.0, 1.0].

    Raises argparse.ArgumentTypeError when the value is out of range, so
    argparse reports a clean usage error instead of a traceback.
    """
    value = float(x)
    if 0.0 <= value <= 1.0:
        return value
    raise argparse.ArgumentTypeError(f"{value} not in range [0.0, 1.0]")
50+
51+
## Readme excerpt categories; run_classifiers looks each one up in the
## config file to locate its pickled classifier.
categories = ['description','citation','installation','invocation']
## Fields kept from the GitHub repository API response (filtered in
## load_repository_metadata).
keep_keys = ('description', 'name', 'owner', 'license', 'languages_url', 'forks_url')
53+
54+
55+
## Function uses the repository_url provided to load required information from github.
## Information kept from the repository is written in keep_keys.
## Returns the readme text and required metadata
def load_repository_metadata(repository_url):
    """Fetch repository metadata and README text from the GitHub API.

    Parameters:
        repository_url: URL of the form https://github.com/{owner}/{repo_name}.

    Returns:
        (text, filtered_resp): the README converted to plain text, and a dict
        holding the keep_keys fields plus topics, languages, readme_url and
        releases.

    Exits the process on a non-GitHub URL, a malformed path, or an unknown
    repository.
    """
    print("Loading Repository Information....")
    ## load general response of the repository
    url = urlparse(repository_url)
    if url.netloc != 'github.com':
        sys.exit("Error: repository must come from github")
    # Tolerate a trailing slash and reject URLs with extra path segments
    # (the original bare split crashed on either).
    path_parts = url.path.strip('/').split('/')
    if len(path_parts) != 2:
        sys.exit("Error: repository url must be https://github.com/{owner}/{repo_name}")
    owner, repo_name = path_parts
    general_resp = requests.get(f"https://api.github.com/repos/{owner}/{repo_name}", headers=header).json()

    if 'message' in general_resp.keys() and general_resp['message'] == "Not Found":
        sys.exit("Error: repository name is incorrect")

    ## Remove extraneous data
    filtered_resp = {k: general_resp[k] for k in keep_keys}

    ## Condense owner information to the login name
    if filtered_resp['owner'] and 'login' in filtered_resp['owner'].keys():
        filtered_resp['owner'] = filtered_resp['owner']['login']

    ## condense license information
    license_info = {}
    for k in ('name', 'url'):
        if filtered_resp['license'] and k in filtered_resp['license'].keys():
            license_info[k] = filtered_resp['license'][k]
    filtered_resp['license'] = license_info

    # get keywords / topics. The topics endpoint needs the mercy-preview
    # accept header. BUG FIX: the original rebuilt topics_headers with a
    # fresh dict after update(header), silently dropping the Authorization
    # token; now only the accept key is overridden.
    topics_headers = {}
    topics_headers.update(header)
    topics_headers['accept'] = 'application/vnd.github.mercy-preview+json'
    topics_resp = requests.get('https://api.github.com/repos/' + owner + "/" + repo_name + '/topics', headers=topics_headers).json()
    if topics_resp and 'names' in topics_resp.keys():
        filtered_resp['topics'] = topics_resp['names']

    ## get languages
    filtered_resp['languages'] = list(requests.get(filtered_resp['languages_url']).json().keys())
    del filtered_resp['languages_url']

    ## get default README and convert it to plain text
    readme_info = requests.get('https://api.github.com/repos/' + owner + "/" + repo_name + '/readme', headers=topics_headers).json()
    readme = base64.b64decode(readme_info['content']).decode("utf-8")
    text = unmark(readme)
    filtered_resp['readme_url'] = readme_info['html_url']

    ## get releases, keeping only the fields listed in the lambda
    releases_list = requests.get('https://api.github.com/repos/' + owner + "/" + repo_name + '/releases', headers=header).json()
    releases_list = map(lambda release: {'tag_name': release['tag_name'], 'name': release['name'], 'author_name': release['author']['login'], 'body': release['body'], 'tarball_url': release['tarball_url'], 'zipball_url': release['zipball_url'], 'html_url': release['html_url'], 'url': release['url']}, releases_list)
    filtered_resp['releases'] = list(releases_list)

    print("Repository Information Successfully Loaded.")
    return text, filtered_resp
109+
110+
## Function takes readme text as input and divides it into excerpts
## Returns the extracted excerpts
def create_excerpts(text):
    """Split readme text into excerpts: one per non-empty line."""
    return [line for line in text.splitlines() if line]
116+
117+
## Function takes readme text as input and runs the provided classifiers on it
## Returns the dictionary containing scores for each excerpt.
def run_classifiers(text):
    """Score every readme excerpt with each category's pickled classifier.

    Parameters:
        text: plain-text readme content.

    Returns:
        dict mapping category -> {'excerpt': [...], 'confidence': scores}
        where confidence holds each excerpt's positive-class probability.

    Categories with an empty model path in the config are skipped; a missing
    model file aborts the process.
    """
    score_dict = {}
    # The excerpts are the same for every category; build them once instead
    # of once per loop iteration.
    excerpts = create_excerpts(text)
    for category in categories:
        file_name = file_paths[category]
        if file_name == "":
            # No model configured for this category: skip it.
            # (Removed leftover debug print.)
            continue
        if not path.exists(file_name):
            sys.exit("Error: File/Directory does not exist")
        print("Classifying excerpts for the category", category)
        # NOTE(review): pickle.load executes arbitrary code — only load
        # trusted model files. Use a context manager so the file handle is
        # closed (the original leaked it).
        with open(file_name, 'rb') as model_file:
            classifier = pickle.load(model_file)
        scores = classifier.predict_proba(excerpts)
        score_dict[category] = {'excerpt': excerpts, 'confidence': scores[:, 1]}
        print("Excerpt Classification Successful for the Category", category)
    return score_dict
135+
136+
## Function takes scores dictionary and a threshold as input
## Returns predictions containing excerpts with a confidence above the given threshold.
def classify(scores, threshold):
    """Group consecutive excerpts whose confidence meets *threshold*.

    Parameters:
        scores: dict category -> {'excerpt': [...], 'confidence': [...]}
            as produced by run_classifiers.
        threshold: minimum confidence for an excerpt to be kept.

    Returns:
        dict category -> list of {'excerpt': str, 'confidence': [floats]},
        one entry per consecutive run of qualifying excerpts; the run's
        excerpts are concatenated with ' \\n' separators.
    """
    print("Checking Thresholds for Excerpt Classification.")
    predictions = {}
    for ele in scores.keys():
        print("Running for", ele)
        predictions[ele] = []
        excerpt = ""
        confid = []
        for snippet, conf in zip(scores[ele]['excerpt'], scores[ele]['confidence']):
            if conf >= threshold:
                # Extend the current run (the original duplicated this in
                # both branches of a flag check).
                excerpt = excerpt + snippet + ' \n'
                confid.append(conf)
            elif excerpt:
                # Run ended on a below-threshold excerpt: flush it.
                predictions[ele].append({'excerpt': excerpt, 'confidence': confid})
                excerpt = ""
                confid = []
        # BUG FIX: a qualifying run reaching the end of the list was
        # previously dropped, because flushing only happened when a
        # below-threshold excerpt followed. Flush any trailing run here.
        if excerpt:
            predictions[ele].append({'excerpt': excerpt, 'confidence': confid})
        print("Run completed.")
    print("All Excerpts below the given Threshold Removed.")
    return predictions
166+
167+
## Function takes readme text as input and runs a regex parser on it
## Returns a list of bibtex citations
def extract_bibtex(readme_text):
    """Extract BibTeX entries (e.g. @article{...}) from readme text.

    Parameters:
        readme_text: plain-text readme content.

    Returns:
        list of matched BibTeX citation strings (possibly empty).
    """
    print("Extracting bibtex citation from readme")
    # BUG FIX: [a-zA-Z] — the original class [a-zA-z] also matched the ASCII
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which is never a valid
    # bibtex entry type.
    # NOTE(review): [author|title] is a character class (any single one of
    # those letters or '|'), not alternation; kept as-is so the set of
    # matched entries does not change.
    regex = r'\@[a-zA-Z]+\{[.\n\S\s]+?[author|title][.\n\S\s]+?[author|title][.\n\S\s]+?\n\}'
    citations = re.findall(regex, readme_text)
    # Removed leftover debug prints of the list and its length.
    print("Extracting bibtex citation from readme completed.")
    return citations
178+
179+
## Function takes metadata, readme text predictions, bibtex citations and path to the output file
## Performs some combinations and saves the final json Object in the file
def save_json(git_data, repo_data, citations, outfile):
    """Merge GitHub metadata and citations into *repo_data* and write it as JSON.

    git_data:  repository metadata dict from the GitHub API.
    repo_data: classified-excerpt predictions; mutated in place.
    citations: list of bibtex citation strings.
    outfile:   path of the JSON file to write.
    """
    for key, value in git_data.items():
        if key == 'description':
            # The GitHub description joins the classified description excerpts.
            repo_data.setdefault('description', []).append(value)
        else:
            repo_data[key] = value

    for citation in citations:
        repo_data.setdefault('citation', []).append({'excerpt': citation})

    print("Saving json data to", outfile)
    with open(outfile, 'w') as output:
        json.dump(repo_data, output)
199+
200+
## ---- Script entry: configuration, CLI parsing, and the extraction pipeline ---- ##

# GitHub API headers. config.json supplies the personal access token and
# also maps each category name to its pickled classifier path (file_paths
# is read by run_classifiers).
header = {}
with open('config.json') as fh:
    file_paths = json.load(fh)
header['Authorization'] = file_paths['Authorization']
header['accept'] = 'application/vnd.github.v3+json'

# Command line interface: exactly one of --repo_url / --doc_src is required.
argparser = argparse.ArgumentParser(description="Fetch Github README, split paragraphs, run classifiers and output json containing repository information, classified excerpts and confidence.")
src = argparser.add_mutually_exclusive_group(required=True)
src.add_argument('-r', '--repo_url', help="URL of the Github repository")
src.add_argument('-d', '--doc_src', help='path to documentation file')
argparser.add_argument('-o', '--output', help="path for output json", required=True)
argparser.add_argument('-t','--threshold', help="threshold score", type=restricted_float, default=0.5)
argv = argparser.parse_args()

# Obtain readme text either from the GitHub API (with repository metadata)
# or from a local markdown file (metadata stays empty).
github_data = {}
if (argv.repo_url):
    text, github_data = load_repository_metadata(argv.repo_url)
elif (argv.doc_src):
    # Documentation from already downloaded Markdown file.
    with open(argv.doc_src, 'r') as doc_fh:
        text = unmark(doc_fh.read())

# Pipeline: score excerpts per category, keep runs above the threshold,
# pull bibtex citations, then merge and write the JSON output.
score_dict = run_classifiers(text)

predictions = classify(score_dict, argv.threshold)

citations = extract_bibtex(text)

save_json(github_data, predictions, citations, argv.output)

config.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"Authorization" : "token PersonalAccessToken",
3+
"description" : "./models/description.sk",
4+
"citation" : "./models/citation.sk",
5+
"installation" : "./models/installation.sk",
6+
"invocation" : "./models/invocation.sk"
7+
}

0 commit comments

Comments
 (0)