5 changes: 4 additions & 1 deletion .gitignore
@@ -23,4 +23,7 @@ temp/
[._]sw[a-p]

# Script output
*.diff
*.diff

# Python pycache
/__pycache__
116 changes: 116 additions & 0 deletions diff_report.py
@@ -0,0 +1,116 @@
from typing import List, Optional, Dict, Any
from pr_diff import Pr

class DiffReport:
    """
    Compares the diffs of two Pr objects and produces a structured report of differences.
    """

    def __init__(self, pr1: Pr, pr2: Pr):
        self.pr1 = pr1
        self.pr2 = pr2

        # Final structured output
        self.identical_files: List[str] = []
        self.different_files: Dict[str, List[Any]] = {}
        self.generate()

    def generate(self):
        """Generate the report data structure by comparing two PR diffs."""
        pr1_files = {f.file_path: f for f in self.pr1.pr_diff.file_diffs}
        pr2_files = {f.file_path: f for f in self.pr2.pr_diff.file_diffs}

        all_files = sorted(set(pr1_files.keys()) | set(pr2_files.keys()))

        for file_path in all_files:
            file1 = pr1_files.get(file_path)
            file2 = pr2_files.get(file_path)

            if file1 and file2:
                if file1.md5 == file2.md5:
                    self.identical_files.append(file_path)
                else:
                    aligned = self._align_hunks(file1.hunks, file2.hunks)
                    self.different_files[file_path] = aligned
            else:
                aligned = []
                if file1 and not file2:
                    aligned = self._align_hunks(file1.hunks, [])
                elif file2 and not file1:
                    aligned = self._align_hunks([], file2.hunks)

                self.different_files[file_path] = aligned

    def _align_hunks(self, hunks1: List, hunks2: List) -> List[Dict[str, Any]]:
        """
        Align two lists of hunks based on md5 matching.
        Returns a list of dicts with left/right hunks aligned.
        """
        aligned = []
        h1_index, h2_index = 0, 0
        total_h1, total_h2 = len(hunks1), len(hunks2)

        while h1_index < total_h1 or h2_index < total_h2:
            current_h1 = hunks1[h1_index] if h1_index < total_h1 else None
            current_h2 = hunks2[h2_index] if h2_index < total_h2 else None

            # CASE 1: One list is exhausted → treat remaining as extras
            if current_h1 is not None and current_h2 is None:
                aligned.append({"left": current_h1, "right": None})
                h1_index += 1

            elif current_h1 is None and current_h2 is not None:
                aligned.append({"left": None, "right": current_h2})
                h2_index += 1

            # CASE 2: Both lists have hunks
            else:
                # CASE 2A: Direct match
                if current_h1.md5 == current_h2.md5:
                    aligned.append({"left": current_h1, "right": current_h2})
                    h1_index += 1
                    h2_index += 1

                # CASE 2B: Look ahead in the right side to find a match for current_h1
                else:
                    found_h2_match_index = None
                    lookahead_index = h2_index + 1

                    # Scan the rest of the hunks2 list
                    while found_h2_match_index is None and lookahead_index < total_h2:
                        if hunks2[lookahead_index].md5 == current_h1.md5:
                            found_h2_match_index = lookahead_index
                        else:
                            lookahead_index += 1

                    if found_h2_match_index is not None:
                        # Extra right hunks before the match
                        for extra_h2 in hunks2[h2_index:found_h2_match_index]:
                            aligned.append({"left": None, "right": extra_h2})

                        # Match found
                        aligned.append({
                            "left": current_h1,
                            "right": hunks2[found_h2_match_index]
                        })

                        # Update both indexes
                        h1_index += 1
                        h2_index = found_h2_match_index + 1

                    # CASE 2C: No match found at all → extra left hunk
                    else:
                        aligned.append({"left": current_h1, "right": None})
                        h1_index += 1

        return aligned


    def to_dict(self) -> Dict[str, Any]:
        """Return the final structured report as a dictionary."""
        return {
            "left_pr": self.pr1,
            "right_pr": self.pr2,
            "identical_files": self.identical_files,
            "different_files": self.different_files,
        }
36 changes: 2 additions & 34 deletions gh-compr
@@ -21,7 +21,7 @@ tag_diff() {
local diff=$1
local prNumber=$2
local taggedDiff

taggedDiff=$(echo -e "$diff" | sed -E "/^(\+\+\+|\-\-\-)/ s|$| # [PR: ${prNumber}]|")

echo "$taggedDiff"
@@ -52,39 +52,7 @@ else
read -rp "Enter the second PR URL: " prUrl2
fi

output1=$(validate_url_extract_info "$prUrl1")
status=$?
if [ $status -ne 0 ]; then
echo "$output1"
exit 1
fi
output2=$(validate_url_extract_info "$prUrl2")
status=$?
if [ $status -ne 0 ]; then
echo "$output2"
exit 1
fi

read -r owner1 repo1 pr1Number <<< "$output1"
read -r owner2 repo2 pr2Number <<< "$output2"

# Fetch PR diffs using gh CLI
pr1Diff=$(gh pr diff "$pr1Number" -R "$owner1/$repo1")
pr2Diff=$(gh pr diff "$pr2Number" -R "$owner2/$repo2")

# Remove context lines from the diffs
# it would be nice if https://cli.github.com/manual/gh_pr_diff
# had an option to set the number of context lines so this step
# wouldn't be necessary
pr1NoCtxDiff=$(echo "$pr1Diff" | grep -v '^[^+-]')
pr2NoCtxDiff=$(echo "$pr2Diff" | grep -v '^[^+-]')

# Tag headers to make sure filenames always exist in the final diff
pr1TaggedDiff=$(tag_diff "$pr1NoCtxDiff" "$pr1Number")
pr2TaggedDiff=$(tag_diff "$pr2NoCtxDiff" "$pr2Number")

# Generate diff and save to file
diff_output=$(diff -u0 <(echo "$pr1TaggedDiff") <(echo "$pr2TaggedDiff") || true)
diff_output=$(python main.py "$prUrl1" "$prUrl2")
if [ -n "$output" ]; then
echo "$diff_output" > "$output"
echo "Diff saved to $output"
27 changes: 27 additions & 0 deletions main.py
@@ -0,0 +1,27 @@
from typing import List
import re
import subprocess
import sys
from pr_diff import Pr, PrDiff, FileDiff, Hunk
from diff_report import DiffReport
from report_formatter import format_pr_diff_report_markdown


def process_input():
    url1 = sys.argv[1]
    url2 = sys.argv[2]
    return url1, url2


def main():
    url1, url2 = process_input()
    pr1 = Pr(url1)
    pr2 = Pr(url2)
    diff_report = DiffReport(pr1, pr2)

    output = format_pr_diff_report_markdown(diff_report.to_dict())
    print(output)


if __name__ == "__main__":
    main()
133 changes: 133 additions & 0 deletions pr_diff.py
@@ -0,0 +1,133 @@
import subprocess
import re
import hashlib
import sys
from typing import List, Tuple

def compute_md5(data: str) -> str:
    return hashlib.md5(data.encode('utf-8')).hexdigest()

class Hunk:
    def __init__(self, text: str):
        self.text = text.strip()
Owner:

if I followed everything correctly, text here begins with a hunk header like: @@ -5,3 +5,6 @@. What happens if two diffs are identical but (for example due to the PRs being based on different branches) are applied on different lines of the files? Would that mistakenly count as a different diff then? Or have I missed something?

Other than that, using the md5 checksum should work fine as a strategy to compare hunks, as long as the two diffs are generated with the same number of lines of context around the changes, which appears to be 3 (non-configurable) for gh pr diff.

I'm wondering if we should limit the comparison to the actual changes, similarly to what was happening before:

gh-compr/gh-compr

Lines 79 to 80 in 3785a2d

pr1NoCtxDiff=$(echo "$pr1Diff" | grep -v '^[^+-]')
pr2NoCtxDiff=$(echo "$pr2Diff" | grep -v '^[^+-]')

Author:

You are correct - however I'm not sure if we could reliably tell whether a given diff is the same or not. There certainly are cases where the "left" hunk is the same as the "right" hunk but only the line numbers differ - but can we be sure in that case that these should be aligned? I think there are two scenarios:

  1. Some lines were added at the top of the "right" hunk, so the diff is the same but the line numbers differ.
  2. The "right" hunk appears in a completely different place in the file and results in a different execution flow.

I'm not sure how to distinguish between these two 🤔

Owner:

I'm not sure either at the moment. But this seems to happen quite often, especially when comparing backports, so we really need to find a solution. Otherwise, many diffs will be marked as different even when they (logically) aren't.

This change makes it easier to identify identical diffs (100% similarity cases). However, I'm worried that false positives would create confusion, since the similarity score would appear "wrong".

The current code might be vulnerable to scenario 2 if I remember correctly. I think - maybe - that if the purpose is to validate that nothing was missed/forgotten between two PRs, it could be good enough to check that there are no missing hunks, and trust the developers that they fit in the right place. WDYT?

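For illustration, with the aligned structure produced by DiffReport._align_hunks, such a "nothing is missing" check could be sketched roughly as below; the variable names are illustrative, not part of this PR.

# Sketch only: flag hunks that have no counterpart on the other side,
# given `aligned`, one file's list of {"left": ..., "right": ...} pairs.
missing_on_right = [pair["left"] for pair in aligned if pair["right"] is None]
missing_on_left = [pair["right"] for pair in aligned if pair["left"] is None]
nothing_missing = not missing_on_right and not missing_on_left
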
Author:

I agree - we can tackle scenario 2 a bit better. I'm experimenting with the algorithm. I've added a hunk similarity comparison based on + / - lines. Here's an example:
Kong/kong#14756 (comment)

What do you think?

We might also add some sort of fuzzy matching, but I'm not sure whether that would hinder the fidelity of the score 🤔

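For reference, a minimal sketch of the idea discussed in this thread: compare hunks only by their added/removed lines, ignoring the @@ header and any context lines, and score partial matches. The helper names below (changed_lines, normalized_md5, similarity) are hypothetical, not part of this PR.

# Sketch only (not part of this PR): compare hunks by their +/- lines,
# ignoring the "@@ ..." hunk header and context lines.
import difflib
import hashlib
from typing import List

def changed_lines(hunk_text: str) -> List[str]:
    """Keep only added/removed lines; drop the hunk header, context and file headers."""
    return [
        line for line in hunk_text.splitlines()
        if line.startswith(('+', '-')) and not line.startswith(('+++', '---'))
    ]

def normalized_md5(hunk_text: str) -> str:
    """md5 over the changed lines only, so identical changes at different line numbers still match."""
    data = "\n".join(changed_lines(hunk_text))
    return hashlib.md5(data.encode("utf-8")).hexdigest()

def similarity(hunk_a: str, hunk_b: str) -> float:
    """Similarity score in [0.0, 1.0] between the changed lines of two hunks."""
    return difflib.SequenceMatcher(None, changed_lines(hunk_a), changed_lines(hunk_b)).ratio()

Hashing normalized_md5(hunk.text) instead of the raw hunk text would make the md5-based alignment tolerant of pure line-number shifts, while similarity could drive the fuzzy matching mentioned above.
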
        self.md5 = self._compute_md5(self.text)

    @staticmethod
    def _compute_md5(data: str) -> str:
        return hashlib.md5(data.encode('utf-8')).hexdigest()

    def __repr__(self):
        return f"<Hunk md5={self.md5} length={len(self.text)}>"

    def pretty_print(self):
        print("\t\t", self)


class FileDiff:
    """Represents the diff for a single file, containing multiple hunks."""

    def __init__(self, file_path: str, diff_text: str):
        self.file_path = file_path
        self.diff_text = diff_text.strip()
        self.md5 = compute_md5(self.diff_text)
        self.hunks = self._parse_hunks(self.diff_text)

    def _parse_hunks(self, text: str) -> List[Hunk]:
        """Extract all hunks from the file diff."""
        parts = re.split(r'(?=^@@ )', text, flags=re.MULTILINE)
        return [Hunk(part) for part in parts if part.strip().startswith('@@')]

    def __repr__(self):
        return f"<FileDiff path=['{self.file_path}'] md5=[{self.md5}] hunks=[{len(self.hunks)}]>"

    def pretty_print(self):
        print("\t", self)
        for hunk in self.hunks:
            hunk.pretty_print()


class PrDiff:
    """Represents the entire PR diff, containing multiple file diffs."""

    def __init__(self, diff_text: str):
        self.diff_text = diff_text.strip()
        self.md5 = compute_md5(self.diff_text)
        self.file_diffs = self._parse_chunks(self.diff_text)

    def _parse_chunks(self, diff_text: str) -> List[FileDiff]:
        """Split the PR diff into individual file diffs."""
        # Split on "diff --git" lines
        raw_chunks = re.split(r'(?=^diff --git)', diff_text, flags=re.MULTILINE)
        raw_chunks = [chunk.strip() for chunk in raw_chunks if chunk.strip()]

        file_diffs = []
        for chunk in raw_chunks:
            lines = chunk.splitlines()

            # Extract file path from the first line
            # Format: diff --git a/path/to/file b/path/to/file
            match = re.match(r'^diff --git a/(.+?) b/\1$', lines[0])
            if match:
                file_path = match.group(1)
            else:
                # Fallback if exact match fails
                file_path = lines[0].split()[2][2:]
Comment on lines +73 to +74
Owner:

is this an expected scenario? Or should we log something / panic?

Author:

Yeah, you're right - we should panic if it doesn't match.

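A minimal sketch of that stricter behaviour (raising instead of silently guessing the path); the exact exception and message are illustrative, not part of this PR:

# Sketch only: fail loudly when the "diff --git" header is unexpected.
match = re.match(r'^diff --git a/(.+?) b/\1$', lines[0])
if match:
    file_path = match.group(1)
else:
    raise ValueError(f"Could not extract file path from: {lines[0]!r}")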

            file_diffs.append(FileDiff(file_path, chunk))

        return file_diffs

    def __repr__(self):
        return f"<PrDiff md5=[{self.md5}] files=[{len(self.file_diffs)}]>"

    def pretty_print(self):
        print(self)
        for chunk in self.file_diffs:
            chunk.pretty_print()

class Pr:
    def __init__(self, url: str):
        owner, repo, pull_number = self._validate_url_extract_info(url)
        self.url = url
        self.repo = repo
        self.number = pull_number

        raw_pr_diff = self._download_pr_diff(url)
        self.pr_diff = PrDiff(raw_pr_diff)

    def __repr__(self):
        return f"<Pr number=[{self.number}] diff_md5=[{self.pr_diff.md5}] files=[{len(self.pr_diff.file_diffs)}]>"

    def to_markdown(self):
        return f"PR *#{self.number}*, diff md5: *{self.pr_diff.md5}*, files: *{len(self.pr_diff.file_diffs)}*"


    @staticmethod
    def _download_pr_diff(pr_url: str):
        completed_process = subprocess.run(["gh", "pr", "diff", pr_url], capture_output=True, text=True)
        if completed_process.returncode != 0:
            print(completed_process.stderr, file=sys.stderr)
            raise RuntimeError(f"Could not download pr diff from: {pr_url}")
        return completed_process.stdout

    @staticmethod
    def _validate_url_extract_info(url: str) -> Tuple[str, str, int]:
        """
        Validate a GitHub pull request URL.

        :param url: The GitHub PR URL to validate.
        :return: A (owner, repo, pull_number) tuple.
        :raises ValueError: If the URL does not match the expected format.
        """
        regex = r"^https://github\.com/([a-zA-Z0-9-]+)/([a-zA-Z0-9-]+)/pull/([0-9]+)$"
        match = re.match(regex, url)

        if not match:
            raise ValueError(
                f"PR URL '{url}' does not match the expected format: "
                "'https://github.com/<owner>/<repo>/pull/<number>'"
            )

        owner, repo, pull_number = match.groups()
        return owner, repo, int(pull_number)
