Skip to content

Commit 8588d50

Browse files
committed
Initial code for patchparser
1 parent f2c68c3 commit 8588d50

File tree

6 files changed

+334
-3
lines changed

6 files changed

+334
-3
lines changed

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
# PatchParser
2-
A python package for extracting features from a patch
2+
A python package to extract key features from a commit patch.
3+
4+
***Please note this repository is still in the initial development phase.***
35

46
## Features
57

68
|Columns |Type|Description |
79
|--------------------|----|-------------------------------------------------------------------------------------------|
8-
|repo |str |Repository link |
10+
|repo_owner |str |Repository Owner |
11+
|repo_name |str |Repository Name |
912
|sha |str |Target Commit SHA |
1013
|message |str |Associated commit message |
1114
|file_name |str |Name of file altered in patch |
@@ -17,7 +20,7 @@ A python package for extracting features from a patch
1720
|total_patches |int |Total number of patches per file |
1821
|raw_patch_header |str |Header of the patch (@@ -A,X +B,Y @@) |
1922
|raw_patch |str |The raw patch for a single patch |
20-
|original_code |str |The lef side (parent commit state) of the git diff in GitHub. Raw code. -'s are stripped. |
23+
|original_code |str |The left side (parent commit state) of the git diff in GitHub. Raw code. -'s are stripped. |
2124
|original_line_start |int |Original line start number (@@ -**A**,X +B,Y @@) |
2225
|original_line_length|int |Original line length (@@ -A,**X** +B,Y @@) |
2326
|original_line_end |int |original_line_start + original_line_length - 1 |

pyproject.toml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
[build-system]
2+
# These are the assumed default build requirements from pip:
3+
# https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support
4+
requires = ["setuptools>=61.0", "wheel"]
5+
build-backend = "setuptools.build_meta"
6+
7+
[project]
8+
name = "patchparser"
9+
version = "0.0.1"
10+
authors = [
11+
{ name="Trevor Dunlap", email="[email protected]" },
12+
]
13+
description = "A python package to extract key features from a commit patch."
14+
readme = "README.md"
15+
license = { file="LICENSE" }
16+
requires-python = ">=3.7"
17+
classifiers = [
18+
"Programming Language :: Python :: 3",
19+
"License :: OSI Approved :: MIT License",
20+
"Operating System :: OS Independent",
21+
]
22+
23+
[project.urls]
24+
"Homepage" = "https://github.com/tdunlap607/patchparser"
25+
"Bug Tracker" = "https://github.com/tdunlap607/patchparser/issues"

src/patchparser/__init__.py

Whitespace-only changes.

src/patchparser/github_parser.py

Lines changed: 281 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,281 @@
1+
"""
2+
Helper functions to load and process commit data from GitHub
3+
"""
4+
import re
5+
import requests
6+
7+
8+
class CommitParse:
    """Flat record describing one patch (hunk) of one file in a commit.

    All attributes are created in ``__init__`` in a fixed order, so the
    instance's ``__dict__`` can be used directly as an output row.
    Everything except the three constructor arguments starts as ``None``
    and is filled in by the parsing functions.
    """

    def __init__(self, repo_owner: str, repo_name: str, sha: str) -> None:
        """Initialize a record to hold the data parsed from a commit.

        Args:
            repo_owner (str): Repo owner
            repo_name (str): Repo name
            sha (str): Target commit SHA
        """
        # Commit-level identification.
        self.repo_owner = repo_owner
        self.repo_name = repo_name
        self.sha = sha
        self.message = None

        # File-level information.
        self.file_name = None
        self.file_number = None
        self.file_extension = None
        self.total_files_changed = None
        self.raw_file_patch = None

        # Patch-level (single hunk) information.
        self.patch_number = None
        self.total_patches = None
        self.raw_patch_header = None
        self.raw_patch = None

        # Original side of the hunk (@@ -A,X ... @@).
        self.original_code = None
        self.original_line_start = None
        self.original_line_length = None
        self.original_line_end = None

        # Modified side of the hunk (@@ ... +B,Y @@).
        self.modified_code = None
        self.modified_line_start = None
        self.modified_line_length = None
        self.modified_line_end = None

        # Per-hunk change counts and the changed lines themselves.
        self.additions = None
        self.added_code = None
        self.deletions = None
        self.deleted_code = None
        self.changes = None

        # File-level status and totals.
        self.status = None
        self.total_file_additions = None
        self.total_file_deletions = None
        self.total_file_changes = None
50+
51+
52+
# Matches a unified-diff hunk header at the start of a line, e.g.
# "@@ -A,X +B,Y @@". The ",X" / ",Y" lengths are optional. Anchoring at the
# line start keeps "@@ ... @@" text inside changed code from being mistaken
# for a header, since patch content lines start with " ", "+", "-" or "\".
_HUNK_HEADER_RE = re.compile(
    r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@", re.MULTILINE
)


def _hunk_span(start, length):
    """Convert one side of a hunk header into (start, length, end) integers.

    Args:
        start (str): Captured start line number.
        length (str or None): Captured length, or None when the header omits
            it. In unified diff format an omitted length means one line.

    Returns:
        tuple: (line_start, line_length, line_end), where line_end is
        line_start + line_length - 1.
    """
    line_start = int(start)
    line_length = 1 if length is None else int(length)
    return line_start, line_length, line_start + line_length - 1


def _file_level_record(parsed_commit, file_fields):
    """Create a CommitParse carrying the commit info plus file-level fields."""
    record = CommitParse(parsed_commit.repo_owner,
                         parsed_commit.repo_name,
                         parsed_commit.sha)
    record.message = parsed_commit.message
    for attribute, value in file_fields.items():
        setattr(record, attribute, value)
    return record


def parse_commit_info(commit_info: list, parsed_commit: "CommitParse") -> list:
    """Parses the commit_info list into one row per patch (hunk).

    Each entry of commit_info represents one changed file. A file with a
    patch yields one row per hunk header found in that patch; a file whose
    patch is missing or None (e.g., XLSX files) yields a single row with
    only the file-level fields set.

    Args:
        commit_info (list): List of per-file dictionaries. Each must provide
            "filename", "status", "additions", "deletions" and "changes";
            "patch" is optional.
        parsed_commit (CommitParse): CommitParse holding the repo owner,
            repo name, SHA and commit message to copy onto every row.

    Returns:
        list: List of dictionaries (CommitParse ``__dict__``), one per row.
    """
    data = []
    total_files_changed = len(commit_info)

    for file_number, row in enumerate(commit_info):
        file_name = row["filename"]
        # .get() so that an absent "patch" key is handled like a None patch
        # instead of raising KeyError.
        raw_file_patch = row.get("patch")

        file_fields = {
            "file_name": file_name,
            "file_number": file_number,
            "file_extension": file_name.split(".")[-1],
            "total_files_changed": total_files_changed,
            "raw_file_patch": raw_file_patch,
            "status": row["status"],
            "total_file_additions": row["additions"],
            "total_file_deletions": row["deletions"],
            "total_file_changes": row["changes"],
        }

        if raw_file_patch is None:
            # No patch text: keep a single file-level row.
            data.append(_file_level_record(parsed_commit, file_fields).__dict__)
            continue

        hunks = list(_HUNK_HEADER_RE.finditer(raw_file_patch))
        total_patches = len(hunks)

        for patch_number, hunk in enumerate(hunks):
            original_start, original_length, original_end = _hunk_span(
                hunk.group(1), hunk.group(2))
            modified_start, modified_length, modified_end = _hunk_span(
                hunk.group(3), hunk.group(4))

            # The raw patch runs from just after the header's closing "@@"
            # (so any text trailing the header on the same line is included)
            # up to the next header, or to the end of the file patch for the
            # last hunk.
            if patch_number + 1 == total_patches:
                body_end = len(raw_file_patch)
            else:
                body_end = hunks[patch_number + 1].start()
            raw_patch = raw_file_patch[hunk.end():body_end]

            patch_parse = parse_raw_patch(raw_patch)

            record = _file_level_record(parsed_commit, file_fields)
            record.patch_number = patch_number
            record.total_patches = total_patches
            record.raw_patch_header = hunk.group(0)
            record.raw_patch = raw_patch
            record.original_code = patch_parse["original_code"]
            record.original_line_start = original_start
            record.original_line_length = original_length
            record.original_line_end = original_end
            record.modified_code = patch_parse["modified_code"]
            record.modified_line_start = modified_start
            record.modified_line_length = modified_length
            record.modified_line_end = modified_end
            record.additions = patch_parse["additions"]
            record.added_code = patch_parse["added_code"]
            record.deletions = patch_parse["deletions"]
            record.deleted_code = patch_parse["deleted_code"]
            record.changes = patch_parse["changes"]

            data.append(record.__dict__)

    return data
186+
187+
188+
def parse_raw_patch(temp_raw_patch: str) -> dict:
    """Parses a single raw patch into original code and modified code.

    Lines starting with "-" belong to the original code, lines starting
    with "+" belong to the modified code, and every other line is treated
    as unchanged and added to both. The first character of each line (the
    diff indicator column) is stripped. Lines starting with a backslash
    (git's "\\ No newline at end of file" marker) are not code and are
    skipped.

    Args:
        temp_raw_patch (str): Raw string of a single patch

    Returns:
        dict: Dictionary with the keys original_code, modified_code,
        additions, added_code, deletions, deleted_code and changes. The
        *_code values are newline-joined strings; the others are ints, with
        changes = additions + deletions.
    """
    original_code = []
    modified_code = []
    added_code = []
    deleted_code = []

    for line in temp_raw_patch.splitlines():
        if line.startswith("\\"):
            # Diff metadata such as "\ No newline at end of file".
            continue

        # Drop the leading +/-/space indicator column.
        content = line[1:]
        if line.startswith("-"):
            original_code.append(content)
            deleted_code.append(content)
        elif line.startswith("+"):
            modified_code.append(content)
            added_code.append(content)
        else:
            original_code.append(content)
            modified_code.append(content)

    additions = len(added_code)
    deletions = len(deleted_code)

    return dict(
        original_code="\n".join(original_code),
        modified_code="\n".join(modified_code),
        additions=additions,
        added_code="\n".join(added_code),
        deletions=deletions,
        deleted_code="\n".join(deleted_code),
        changes=additions + deletions,
    )
246+
247+
248+
def commit(repo_owner: str, repo_name: str, sha: str, verbose=False,
           timeout: float = 30.0) -> list:
    """Pass the GitHub repo_owner, repo_name, and associated commit to parse.

    Fetches the commit from the GitHub REST API and parses every changed
    file into one row per patch.

    Args:
        repo_owner (str): Target repo owner
        repo_name (str): Target repo name
        sha (str): Target commit SHA from GitHub
        verbose (bool): Currently unused.
        timeout (float): Seconds to wait for the GitHub API before giving up.

    Returns:
        list: List of dictionaries structured around the class CommitParse

    Raises:
        requests.HTTPError: If GitHub answers with an error status
            (e.g., unknown commit or rate limit exceeded).
    """
    # Commit info API URL
    url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/commits/{sha}"

    response = requests.get(url, timeout=timeout)
    try:
        # Fail with a clear HTTP error instead of a KeyError further down
        # when the body is an API error message rather than a commit.
        response.raise_for_status()
        commit_info = response.json()
    finally:
        response.close()

    # Initialize a CommitParse to hold the commit-level data
    parsed_commit = CommitParse(repo_owner=repo_owner,
                                repo_name=repo_name,
                                sha=commit_info["sha"])
    parsed_commit.message = commit_info["commit"]["message"]

    # Parse the files
    return parse_commit_info(commit_info["files"], parsed_commit)

tests/__init__.py

Whitespace-only changes.

tests/test_github_parser.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import unittest
2+
from patchparser import github_parser as gp
3+
4+
5+
class TestGitHubPatchParser(unittest.TestCase):
    """Tests for the GitHub commit parser."""

    def test_commit(self):
        """
        Parse a known commit and check the number of rows produced.
        Associated CVE: https://nvd.nist.gov/vuln/detail/CVE-2021-4118
        Example commit: https://github.com/Lightning-AI/lightning/commit/62f1e82e032eb16565e676d39e0db0cac7e34ace
        """
        target = {
            "repo_owner": "Lightning-AI",
            "repo_name": "lightning",
            "sha": "62f1e82e032eb16565e676d39e0db0cac7e34ace",
        }
        rows = gp.commit(**target)

        # Expecting 5 changes from the above commit
        self.assertEqual(len(rows), 5)
19+
20+
21+
# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)