Skip to content

Commit 21ac708

Browse files
committed
add commit diff streams
1 parent 83c02bf commit 21ac708

File tree

2 files changed

+160
-3
lines changed

2 files changed

+160
-3
lines changed

tap_github/repository_streams.py

Lines changed: 154 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,14 @@ def post_process(self, row: dict, context: dict | None = None) -> dict:
10791079
row["commit_timestamp"] = row["commit"]["committer"]["date"]
10801080
return row
10811081

1082+
def get_child_context(self, record: dict, context: dict | None) -> dict:
1083+
return {
1084+
"org": context["org"] if context else None,
1085+
"repo": context["repo"] if context else None,
1086+
"repo_id": context["repo_id"] if context else None,
1087+
"commit_id": record["sha"]
1088+
}
1089+
10821090
schema = th.PropertiesList(
10831091
th.Property("org", th.StringType),
10841092
th.Property("repo", th.StringType),
@@ -1161,6 +1169,73 @@ class CommitCommentsStream(GitHubRestStream):
11611169
th.Property("author_association", th.StringType),
11621170
).to_dict()
11631171

1172+
class CommitDiffsStream(GitHubRestStream):
1173+
name = "commit_diffs"
1174+
path = "/repos/{org}/{repo}/commits/{commit_id}"
1175+
primary_keys: ClassVar[list[str]] = ["commit_id"]
1176+
parent_stream_type = CommitsStream
1177+
ignore_parent_replication_key = False
1178+
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
1179+
1180+
@property
1181+
def http_headers(self) -> dict:
1182+
headers = super().http_headers
1183+
headers["Accept"] = "application/vnd.github.v3.diff"
1184+
return headers
1185+
1186+
def parse_response(self, response: requests.Response) -> Iterable[dict]:
1187+
"""Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1188+
if response.status_code != 200:
1189+
contents = response.json()
1190+
self.logger.info(
1191+
"Skipping commit due to %d error: %s",
1192+
response.status_code,
1193+
contents["message"],
1194+
)
1195+
yield {
1196+
"success": False,
1197+
"error_message": contents["message"],
1198+
}
1199+
return
1200+
1201+
if content_length_str := response.headers.get("Content-Length"):
1202+
content_length = int(content_length_str)
1203+
max_size = 41_943_040 # 40 MiB
1204+
if content_length > max_size:
1205+
self.logger.info(
1206+
"Skipping commit. The diff size (%.2f MiB) exceeded the maximum"
1207+
" size limit of 40 MiB.",
1208+
content_length / 1024 / 1024,
1209+
)
1210+
yield {
1211+
"success": False,
1212+
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
1213+
}
1214+
return
1215+
1216+
yield {"diff": response.text, "success": True}
1217+
1218+
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
1219+
row = super().post_process(row, context)
1220+
if context is not None:
1221+
# Get commit ID (sha) from context
1222+
row["org"] = context["org"]
1223+
row["repo"] = context["repo"]
1224+
row["repo_id"] = context["repo_id"]
1225+
row["commit_id"] = context["commit_id"]
1226+
return row
1227+
1228+
schema = th.PropertiesList(
1229+
# Parent keys
1230+
th.Property("org", th.StringType),
1231+
th.Property("repo", th.StringType),
1232+
th.Property("repo_id", th.IntegerType),
1233+
th.Property("commit_id", th.StringType),
1234+
# Rest
1235+
th.Property("diff", th.StringType),
1236+
th.Property("success", th.BooleanType),
1237+
th.Property("error_message", th.StringType),
1238+
).to_dict()
11641239

11651240
class LabelsStream(GitHubRestStream):
11661241
"""Defines 'labels' stream."""
@@ -1354,14 +1429,23 @@ def get_child_context(self, record: dict, context: dict | None) -> dict:
13541429
).to_dict()
13551430

13561431

1357-
class PullRequestCommits(GitHubRestStream):
1432+
class PullRequestCommitsStream(GitHubRestStream):
13581433
name = "pull_request_commits"
13591434
path = "/repos/{org}/{repo}/pulls/{pull_number}/commits"
13601435
ignore_parent_replication_key = False
13611436
primary_keys: ClassVar[list[str]] = ["node_id"]
13621437
parent_stream_type = PullRequestsStream
13631438
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
13641439

1440+
def get_child_context(self, record: dict, context: dict | None) -> dict:
1441+
return {
1442+
"org": context["org"] if context else None,
1443+
"repo": context["repo"] if context else None,
1444+
"repo_id": context["repo_id"] if context else None,
1445+
"pull_number": context["pull_number"] if context else None,
1446+
"commit_id": record["sha"],
1447+
}
1448+
13651449
schema = th.PropertiesList(
13661450
# Parent keys
13671451
th.Property("org", th.StringType),
@@ -1514,6 +1598,75 @@ def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict
15141598
th.Property("error_message", th.StringType),
15151599
).to_dict()
15161600

1601+
class PullRequestCommitDiffsStream(GitHubRestStream):
1602+
name = "pull_request_commit_diffs"
1603+
path = "/repos/{org}/{repo}/commits/{commit_id}"
1604+
primary_keys: ClassVar[list[str]] = ["commit_id"]
1605+
parent_stream_type = PullRequestCommitsStream
1606+
ignore_parent_replication_key = False
1607+
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
1608+
1609+
@property
1610+
def http_headers(self) -> dict:
1611+
headers = super().http_headers
1612+
headers["Accept"] = "application/vnd.github.v3.diff"
1613+
return headers
1614+
1615+
def parse_response(self, response: requests.Response) -> Iterable[dict]:
1616+
"""Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1617+
if response.status_code != 200:
1618+
contents = response.json()
1619+
self.logger.info(
1620+
"Skipping commit due to %d error: %s",
1621+
response.status_code,
1622+
contents["message"],
1623+
)
1624+
yield {
1625+
"success": False,
1626+
"error_message": contents["message"],
1627+
}
1628+
return
1629+
1630+
if content_length_str := response.headers.get("Content-Length"):
1631+
content_length = int(content_length_str)
1632+
max_size = 41_943_040 # 40 MiB
1633+
if content_length > max_size:
1634+
self.logger.info(
1635+
"Skipping commit. The diff size (%.2f MiB) exceeded the maximum"
1636+
" size limit of 40 MiB.",
1637+
content_length / 1024 / 1024,
1638+
)
1639+
yield {
1640+
"success": False,
1641+
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
1642+
}
1643+
return
1644+
1645+
yield {"diff": response.text, "success": True}
1646+
1647+
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
1648+
row = super().post_process(row, context)
1649+
if context is not None:
1650+
# Get commit ID (sha) from context
1651+
row["org"] = context["org"]
1652+
row["repo"] = context["repo"]
1653+
row["repo_id"] = context["repo_id"]
1654+
row["pull_number"] = context["pull_number"]
1655+
row["commit_id"] = context["commit_id"]
1656+
return row
1657+
1658+
schema = th.PropertiesList(
1659+
# Parent keys
1660+
th.Property("org", th.StringType),
1661+
th.Property("repo", th.StringType),
1662+
th.Property("repo_id", th.IntegerType),
1663+
th.Property("pull_number", th.IntegerType),
1664+
th.Property("commit_id", th.StringType),
1665+
# Rest
1666+
th.Property("diff", th.StringType),
1667+
th.Property("success", th.BooleanType),
1668+
th.Property("error_message", th.StringType),
1669+
).to_dict()
15171670

15181671
class ReviewsStream(GitHubRestStream):
15191672
name = "reviews"

tap_github/streams.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
CollaboratorsStream,
1717
CommitCommentsStream,
1818
CommitsStream,
19+
CommitDiffsStream,
1920
CommunityProfileStream,
2021
ContributorsStream,
2122
DependenciesStream,
@@ -31,7 +32,8 @@
3132
ProjectCardsStream,
3233
ProjectColumnsStream,
3334
ProjectsStream,
34-
PullRequestCommits,
35+
PullRequestCommitsStream,
36+
PullRequestCommitDiffsStream,
3537
PullRequestDiffsStream,
3638
PullRequestsStream,
3739
ReadmeHtmlStream,
@@ -79,6 +81,7 @@ def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None
7981
CollaboratorsStream,
8082
CommitCommentsStream,
8183
CommitsStream,
84+
CommitDiffsStream,
8285
CommunityProfileStream,
8386
ContributorsStream,
8487
DependenciesStream,
@@ -93,7 +96,8 @@ def __init__(self, valid_queries: set[str], streams: list[type[Stream]]) -> None
9396
ProjectCardsStream,
9497
ProjectColumnsStream,
9598
ProjectsStream,
96-
PullRequestCommits,
99+
PullRequestCommitsStream,
100+
PullRequestCommitDiffsStream,
97101
PullRequestDiffsStream,
98102
PullRequestsStream,
99103
ReadmeHtmlStream,

0 commit comments

Comments
 (0)