5 changes: 4 additions & 1 deletion .gitignore
@@ -23,4 +23,7 @@ temp/
[._]sw[a-p]

# Script output
*.diff
*.diff

# Python pycache
/__pycache__
116 changes: 116 additions & 0 deletions diff_report.py
@@ -0,0 +1,116 @@
from typing import List, Optional, Dict, Any
from pr_diff import Pr

class DiffReport:
    """
    Compares the diffs of two Pr objects and produces a structured report of differences.
    """

    def __init__(self, pr1: Pr, pr2: Pr):
        self.pr1 = pr1
        self.pr2 = pr2

        # Final structured output
        self.identical_files: List[str] = []
        self.different_files: Dict[str, List[Any]] = {}
        self.generate()

    def generate(self):
        """Generate the report data structure by comparing two PR diffs."""
        pr1_files = {f.file_path: f for f in self.pr1.pr_diff.file_diffs}
        pr2_files = {f.file_path: f for f in self.pr2.pr_diff.file_diffs}

        all_files = sorted(set(pr1_files.keys()) | set(pr2_files.keys()))

        for file_path in all_files:
            file1 = pr1_files.get(file_path)
            file2 = pr2_files.get(file_path)

            if file1 and file2:
                if file1.md5 == file2.md5:
                    self.identical_files.append(file_path)
                else:
                    aligned = self._align_hunks(file1.hunks, file2.hunks)
                    self.different_files[file_path] = aligned
            else:
                aligned = []
                if file1 and not file2:
                    aligned = self._align_hunks(file1.hunks, [])
                elif file2 and not file1:
                    aligned = self._align_hunks([], file2.hunks)

                self.different_files[file_path] = aligned

    def _align_hunks(self, hunks1: List, hunks2: List) -> List[Dict[str, Any]]:
        """
        Align two lists of hunks based on md5 matching.
        Returns a list of dicts with left/right hunks aligned.
        """
        aligned = []
        h1_index, h2_index = 0, 0
        total_h1, total_h2 = len(hunks1), len(hunks2)

        while h1_index < total_h1 or h2_index < total_h2:
            current_h1 = hunks1[h1_index] if h1_index < total_h1 else None
            current_h2 = hunks2[h2_index] if h2_index < total_h2 else None

            # CASE 1: One list is exhausted → treat remaining as extras
            if current_h1 is not None and current_h2 is None:
                aligned.append({"left": current_h1, "right": None})
                h1_index += 1

            elif current_h1 is None and current_h2 is not None:
                aligned.append({"left": None, "right": current_h2})
                h2_index += 1

            # CASE 2: Both lists have hunks
            else:
                # CASE 2A: Direct match
                if current_h1.md5 == current_h2.md5:
                    aligned.append({"left": current_h1, "right": current_h2})
                    h1_index += 1
                    h2_index += 1

                # CASE 2B: Look ahead in the right side to find a match for current_h1
                else:
                    found_h2_match_index = None
                    lookahead_index = h2_index + 1

                    # Scan the rest of the hunks2 list
                    while found_h2_match_index is None and lookahead_index < total_h2:
                        if hunks2[lookahead_index].md5 == current_h1.md5:
                            found_h2_match_index = lookahead_index
                        else:
                            lookahead_index += 1

                    if found_h2_match_index is not None:
                        # Extra right hunks before the match
                        for extra_h2 in hunks2[h2_index:found_h2_match_index]:
                            aligned.append({"left": None, "right": extra_h2})

                        # Match found
                        aligned.append({
                            "left": current_h1,
                            "right": hunks2[found_h2_match_index]
                        })

                        # Update both indexes
                        h1_index += 1
                        h2_index = found_h2_match_index + 1

                    # CASE 2C: No match found at all → extra left hunk
                    else:
                        aligned.append({"left": current_h1, "right": None})
                        h1_index += 1

        return aligned


    def to_dict(self) -> Dict[str, Any]:
        """Return the final structured report as a dictionary."""
        return {
            "left_pr": self.pr1,
            "right_pr": self.pr2,
            "identical_files": self.identical_files,
            "different_files": self.different_files,
        }
36 changes: 2 additions & 34 deletions gh-compr
@@ -21,7 +21,7 @@ tag_diff() {
local diff=$1
local prNumber=$2
local taggedDiff

taggedDiff=$(echo -e "$diff" | sed -E "/^(\+\+\+|\-\-\-)/ s|$| # [PR: ${prNumber}]|")

echo "$taggedDiff"
@@ -52,39 +52,7 @@ else
read -rp "Enter the second PR URL: " prUrl2
fi

output1=$(validate_url_extract_info "$prUrl1")
status=$?
if [ $status -ne 0 ]; then
echo "$output1"
exit 1
fi
output2=$(validate_url_extract_info "$prUrl2")
status=$?
if [ $status -ne 0 ]; then
echo "$output2"
exit 1
fi

read -r owner1 repo1 pr1Number <<< "$output1"
read -r owner2 repo2 pr2Number <<< "$output2"

# Fetch PR diffs using gh CLI
pr1Diff=$(gh pr diff "$pr1Number" -R "$owner1/$repo1")
pr2Diff=$(gh pr diff "$pr2Number" -R "$owner2/$repo2")

# Remove context lines from the diffs
# it would be nice if https://cli.github.com/manual/gh_pr_diff
# had an option to set the number of context lines so this step
# wouldn't be necessary
pr1NoCtxDiff=$(echo "$pr1Diff" | grep -v '^[^+-]')
pr2NoCtxDiff=$(echo "$pr2Diff" | grep -v '^[^+-]')

# Tag headers to make sure filenames always exist in the final diff
pr1TaggedDiff=$(tag_diff "$pr1NoCtxDiff" "$pr1Number")
pr2TaggedDiff=$(tag_diff "$pr2NoCtxDiff" "$pr2Number")

# Generate diff and save to file
diff_output=$(diff -u0 <(echo "$pr1TaggedDiff") <(echo "$pr2TaggedDiff") || true)
diff_output=$(python main.py "$prUrl1" "$prUrl2")
if [ -n "$output" ]; then
echo "$diff_output" > "$output"
echo "Diff saved to $output"
27 changes: 27 additions & 0 deletions main.py
@@ -0,0 +1,27 @@
from typing import List
import re
import subprocess
import sys
from pr_diff import Pr, PrDiff, FileDiff, Hunk
from diff_report import DiffReport
from report_formatter import format_pr_diff_report_markdown


def process_input():
    url1 = sys.argv[1]
    url2 = sys.argv[2]
    return url1, url2


def main():
    url1, url2 = process_input()
    pr1 = Pr(url1)
    pr2 = Pr(url2)
    diff_report = DiffReport(pr1, pr2)

    output = format_pr_diff_report_markdown(diff_report.to_dict())
    print(output)


if __name__ == "__main__":
    main()
133 changes: 133 additions & 0 deletions pr_diff.py
@@ -0,0 +1,133 @@
import subprocess
import re
import hashlib
import sys
from typing import List, Tuple

def compute_md5(data: str) -> str:
    return hashlib.md5(data.encode('utf-8')).hexdigest()

class Hunk:
    def __init__(self, text: str):
        self.text = text.strip()
Owner:

if I followed everything correctly, text here begins with a hunk header like: @@ -5,3 +5,6 @@. What happens if two diffs are identical but (for example due to the PRs being based on different branches) are applied on different lines of the files? Would that mistakenly count as a different diff then? Or have I missed something?

Other than that, using the md5 checksum should work fine as a strategy to compare hunks, as long as the two diffs are generated with the same number of lines of context around the changes, which appears to be 3 (non-configurable) for gh pr diff.

I'm wondering if we should limit the comparison to the actual changes, similarly to what was happening before:

gh-compr/gh-compr

Lines 79 to 80 in 3785a2d

pr1NoCtxDiff=$(echo "$pr1Diff" | grep -v '^[^+-]')
pr2NoCtxDiff=$(echo "$pr2Diff" | grep -v '^[^+-]')

Author:

You are correct - however I'm not sure if we could reliably tell whether a given diff is the same or not. There certainly are cases where the "left" hunk is the same as the "right" hunk but only the line numbers differ - but can we be sure in that case that these should be aligned? I think there are two scenarios:

  1. Some lines were added at the top of the "right" hunk, so the diff is the same but the line numbers differ.
  2. The "right" hunk appears in a completely different place in the file and results in a different execution flow.

I'm not sure how to distinguish between these two 🤔

Owner:

I'm not sure either at the moment. But this seems to happen quite often, especially when comparing backports, so we really need to find a solution. Otherwise, many diffs will be marked as different even when they (logically) aren't.

This change makes it easier to identify identical diffs (100% similarity cases). However, I'm worried that false positives would create confusion, since the similarity score would appear "wrong".

The current code might be vulnerable to scenario 2 if I remember correctly. I think - maybe - that if the purpose is to validate that nothing was missed/forgotten between two PRs, it could be good enough to check that there are no missing hunks, and trust the developers that they fit in the right place. WDYT?

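For illustration, with the aligned structure produced by DiffReport._align_hunks, such a "nothing is missing" check could be sketched roughly as below; the variable names are illustrative, not part of this PR.

# Sketch only: flag hunks that have no counterpart on the other side,
# given `aligned`, one file's list of {"left": ..., "right": ...} pairs.
missing_on_right = [pair["left"] for pair in aligned if pair["right"] is None]
missing_on_left = [pair["right"] for pair in aligned if pair["left"] is None]
nothing_missing = not missing_on_right and not missing_on_left
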
Author:

I agree - we can tackle scenario 2 a bit better. I'm experimenting with the algorithm. I've added a hunk similarity comparison based on + / - lines. Here's an example:
Kong/kong#14756 (comment)

What do you think?

We might also add some sort of fuzzy matching, but I'm not sure whether that would hinder the fidelity of the score 🤔

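For reference, a minimal sketch of the idea discussed in this thread: compare hunks only by their added/removed lines, ignoring the @@ header and any context lines, and score partial matches. The helper names below (changed_lines, normalized_md5, similarity) are hypothetical, not part of this PR.

# Sketch only (not part of this PR): compare hunks by their +/- lines,
# ignoring the "@@ ..." hunk header and context lines.
import difflib
import hashlib
from typing import List

def changed_lines(hunk_text: str) -> List[str]:
    """Keep only added/removed lines; drop the hunk header, context and file headers."""
    return [
        line for line in hunk_text.splitlines()
        if line.startswith(('+', '-')) and not line.startswith(('+++', '---'))
    ]

def normalized_md5(hunk_text: str) -> str:
    """md5 over the changed lines only, so identical changes at different line numbers still match."""
    data = "\n".join(changed_lines(hunk_text))
    return hashlib.md5(data.encode("utf-8")).hexdigest()

def similarity(hunk_a: str, hunk_b: str) -> float:
    """Similarity score in [0.0, 1.0] between the changed lines of two hunks."""
    return difflib.SequenceMatcher(None, changed_lines(hunk_a), changed_lines(hunk_b)).ratio()

Hashing normalized_md5(hunk.text) instead of the raw hunk text would make the md5-based alignment tolerant of pure line-number shifts, while similarity could drive the fuzzy matching mentioned above.
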
        self.md5 = self._compute_md5(self.text)

    @staticmethod
    def _compute_md5(data: str) -> str:
        return hashlib.md5(data.encode('utf-8')).hexdigest()

    def __repr__(self):
        return f"<Hunk md5={self.md5} length={len(self.text)}>"

    def pretty_print(self):
        print("\t\t", self)


class FileDiff:
    """Represents the diff for a single file, containing multiple hunks."""

    def __init__(self, file_path: str, diff_text: str):
        self.file_path = file_path
        self.diff_text = diff_text.strip()
        self.md5 = compute_md5(self.diff_text)
        self.hunks = self._parse_hunks(self.diff_text)

    def _parse_hunks(self, text: str) -> List[Hunk]:
        """Extract all hunks from the file diff."""
        parts = re.split(r'(?=^@@ )', text, flags=re.MULTILINE)
        return [Hunk(part) for part in parts if part.strip().startswith('@@')]

    def __repr__(self):
        return f"<FileDiff path=['{self.file_path}'] md5=[{self.md5}] hunks=[{len(self.hunks)}]>"

    def pretty_print(self):
        print("\t", self)
        for hunk in self.hunks:
            hunk.pretty_print()


class PrDiff:
    """Represents the entire PR diff, containing multiple file diffs."""

    def __init__(self, diff_text: str):
        self.diff_text = diff_text.strip()
        self.md5 = compute_md5(self.diff_text)
        self.file_diffs = self._parse_chunks(self.diff_text)

    def _parse_chunks(self, diff_text: str) -> List[FileDiff]:
        """Split the PR diff into individual file diffs."""
        # Split on "diff --git" lines
        raw_chunks = re.split(r'(?=^diff --git)', diff_text, flags=re.MULTILINE)
        raw_chunks = [chunk.strip() for chunk in raw_chunks if chunk.strip()]

        file_diffs = []
        for chunk in raw_chunks:
            lines = chunk.splitlines()

            # Extract file path from the first line
            # Format: diff --git a/path/to/file b/path/to/file
            match = re.match(r'^diff --git a/(.+?) b/\1$', lines[0])
            if match:
                file_path = match.group(1)
            else:
                # Fallback if exact match fails
                file_path = lines[0].split()[2][2:]
Comment on lines +73 to +74
Owner:

is this an expected scenario? Or should we log something / panic?

Author:

Yeah, you're right - we should panic if it doesn't match.

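A minimal sketch of that stricter behaviour (raising instead of silently guessing the path); the exact exception and message are illustrative, not part of this PR:

# Sketch only: fail loudly when the "diff --git" header is unexpected.
match = re.match(r'^diff --git a/(.+?) b/\1$', lines[0])
if match:
    file_path = match.group(1)
else:
    raise ValueError(f"Could not extract file path from: {lines[0]!r}")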

            file_diffs.append(FileDiff(file_path, chunk))

        return file_diffs

    def __repr__(self):
        return f"<PrDiff md5=[{self.md5}] files=[{len(self.file_diffs)}]>"

    def pretty_print(self):
        print(self)
        for chunk in self.file_diffs:
            chunk.pretty_print()

class Pr:
    def __init__(self, url: str):
        owner, repo, pull_number = self._validate_url_extract_info(url)
        self.url = url
        self.repo = repo
        self.number = pull_number

        raw_pr_diff = self._download_pr_diff(url)
        self.pr_diff = PrDiff(raw_pr_diff)

    def __repr__(self):
        return f"<Pr number=[{self.number}] diff_md5=[{self.pr_diff.md5}] files=[{len(self.pr_diff.file_diffs)}]>"

    def to_markdown(self):
        return f"PR *#{self.number}*, diff md5: *{self.pr_diff.md5}*, files: *{len(self.pr_diff.file_diffs)}*"


    @staticmethod
    def _download_pr_diff(pr_url: str):
        completed_process = subprocess.run(["gh", "pr", "diff", pr_url], capture_output=True, text=True)
        if completed_process.returncode != 0:
            print(completed_process.stderr, file=sys.stderr)
            raise RuntimeError(f"Could not download pr diff from: {pr_url}")
        return completed_process.stdout

    @staticmethod
    def _validate_url_extract_info(url: str) -> Tuple[str, str, int]:
        """
        Validate a GitHub pull request URL.

        :param url: The GitHub PR URL to validate.
        :return: A (owner, repo, pull_number) tuple.
        :raises ValueError: If the URL does not match the expected format.
        """
        regex = r"^https://github\.com/([a-zA-Z0-9-]+)/([a-zA-Z0-9-]+)/pull/([0-9]+)$"
        match = re.match(regex, url)

        if not match:
            raise ValueError(
                f"PR URL '{url}' does not match the expected format: "
                "'https://github.com/<owner>/<repo>/pull/<number>'"
            )

        owner, repo, pull_number = match.groups()
        return owner, repo, int(pull_number)
