@@ -1079,6 +1079,14 @@ def post_process(self, row: dict, context: dict | None = None) -> dict:
10791079 row ["commit_timestamp" ] = row ["commit" ]["committer" ]["date" ]
10801080 return row
10811081
1082+ def get_child_context (self , record : dict , context : dict | None ) -> dict :
1083+ return {
1084+ "org" : context ["org" ] if context else None ,
1085+ "repo" : context ["repo" ] if context else None ,
1086+ "repo_id" : context ["repo_id" ] if context else None ,
1087+ "commit_id" : record ["sha" ]
1088+ }
1089+
10821090 schema = th .PropertiesList (
10831091 th .Property ("org" , th .StringType ),
10841092 th .Property ("repo" , th .StringType ),
@@ -1161,6 +1169,73 @@ class CommitCommentsStream(GitHubRestStream):
11611169 th .Property ("author_association" , th .StringType ),
11621170 ).to_dict ()
11631171
1172+ class CommitDiffsStream (GitHubRestStream ):
1173+ name = "commit_diffs"
1174+ path = "/repos/{org}/{repo}/commits/{commit_id}"
1175+ primary_keys : ClassVar [list [str ]] = ["commit_id" ]
1176+ parent_stream_type = CommitsStream
1177+ ignore_parent_replication_key = False
1178+ state_partitioning_keys : ClassVar [list [str ]] = ["repo" , "org" ]
1179+
1180+ @property
1181+ def http_headers (self ) -> dict :
1182+ headers = super ().http_headers
1183+ headers ["Accept" ] = "application/vnd.github.v3.diff"
1184+ return headers
1185+
1186+ def parse_response (self , response : requests .Response ) -> Iterable [dict ]:
1187+ """Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1188+ if response .status_code != 200 :
1189+ contents = response .json ()
1190+ self .logger .info (
1191+ "Skipping commit due to %d error: %s" ,
1192+ response .status_code ,
1193+ contents ["message" ],
1194+ )
1195+ yield {
1196+ "success" : False ,
1197+ "error_message" : contents ["message" ],
1198+ }
1199+ return
1200+
1201+ if content_length_str := response .headers .get ("Content-Length" ):
1202+ content_length = int (content_length_str )
1203+ max_size = 41_943_040 # 40 MiB
1204+ if content_length > max_size :
1205+ self .logger .info (
1206+ "Skipping commit. The diff size (%.2f MiB) exceeded the maximum"
1207+ " size limit of 40 MiB." ,
1208+ content_length / 1024 / 1024 ,
1209+ )
1210+ yield {
1211+ "success" : False ,
1212+ "error_message" : "Diff exceeded the maximum size limit of 40 MiB." ,
1213+ }
1214+ return
1215+
1216+ yield {"diff" : response .text , "success" : True }
1217+
1218+ def post_process (self , row : dict , context : dict [str , str ] | None = None ) -> dict :
1219+ row = super ().post_process (row , context )
1220+ if context is not None :
1221+ # Get commit ID (sha) from context
1222+ row ["org" ] = context ["org" ]
1223+ row ["repo" ] = context ["repo" ]
1224+ row ["repo_id" ] = context ["repo_id" ]
1225+ row ["commit_id" ] = context ["commit_id" ]
1226+ return row
1227+
1228+ schema = th .PropertiesList (
1229+ # Parent keys
1230+ th .Property ("org" , th .StringType ),
1231+ th .Property ("repo" , th .StringType ),
1232+ th .Property ("repo_id" , th .IntegerType ),
1233+ th .Property ("commit_id" , th .StringType ),
1234+ # Rest
1235+ th .Property ("diff" , th .StringType ),
1236+ th .Property ("success" , th .BooleanType ),
1237+ th .Property ("error_message" , th .StringType ),
1238+ ).to_dict ()
11641239
11651240class LabelsStream (GitHubRestStream ):
11661241 """Defines 'labels' stream."""
@@ -1354,14 +1429,23 @@ def get_child_context(self, record: dict, context: dict | None) -> dict:
13541429 ).to_dict ()
13551430
13561431
1357- class PullRequestCommits (GitHubRestStream ):
1432+ class PullRequestCommitsStream (GitHubRestStream ):
13581433 name = "pull_request_commits"
13591434 path = "/repos/{org}/{repo}/pulls/{pull_number}/commits"
13601435 ignore_parent_replication_key = False
13611436 primary_keys : ClassVar [list [str ]] = ["node_id" ]
13621437 parent_stream_type = PullRequestsStream
13631438 state_partitioning_keys : ClassVar [list [str ]] = ["repo" , "org" ]
13641439
1440+ def get_child_context (self , record : dict , context : dict | None ) -> dict :
1441+ return {
1442+ "org" : context ["org" ] if context else None ,
1443+ "repo" : context ["repo" ] if context else None ,
1444+ "repo_id" : context ["repo_id" ] if context else None ,
1445+ "pull_number" : context ["pull_number" ] if context else None ,
1446+ "commit_id" : record ["sha" ],
1447+ }
1448+
13651449 schema = th .PropertiesList (
13661450 # Parent keys
13671451 th .Property ("org" , th .StringType ),
@@ -1514,6 +1598,75 @@ def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict
15141598 th .Property ("error_message" , th .StringType ),
15151599 ).to_dict ()
15161600
1601+ class PullRequestCommitDiffsStream (GitHubRestStream ):
1602+ name = "pull_request_commit_diffs"
1603+ path = "/repos/{org}/{repo}/commits/{commit_id}"
1604+ primary_keys : ClassVar [list [str ]] = ["commit_id" ]
1605+ parent_stream_type = PullRequestCommitsStream
1606+ ignore_parent_replication_key = False
1607+ state_partitioning_keys : ClassVar [list [str ]] = ["repo" , "org" ]
1608+
1609+ @property
1610+ def http_headers (self ) -> dict :
1611+ headers = super ().http_headers
1612+ headers ["Accept" ] = "application/vnd.github.v3.diff"
1613+ return headers
1614+
1615+ def parse_response (self , response : requests .Response ) -> Iterable [dict ]:
1616+ """Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1617+ if response .status_code != 200 :
1618+ contents = response .json ()
1619+ self .logger .info (
1620+ "Skipping commit due to %d error: %s" ,
1621+ response .status_code ,
1622+ contents ["message" ],
1623+ )
1624+ yield {
1625+ "success" : False ,
1626+ "error_message" : contents ["message" ],
1627+ }
1628+ return
1629+
1630+ if content_length_str := response .headers .get ("Content-Length" ):
1631+ content_length = int (content_length_str )
1632+ max_size = 41_943_040 # 40 MiB
1633+ if content_length > max_size :
1634+ self .logger .info (
1635+ "Skipping commit. The diff size (%.2f MiB) exceeded the maximum"
1636+ " size limit of 40 MiB." ,
1637+ content_length / 1024 / 1024 ,
1638+ )
1639+ yield {
1640+ "success" : False ,
1641+ "error_message" : "Diff exceeded the maximum size limit of 40 MiB." ,
1642+ }
1643+ return
1644+
1645+ yield {"diff" : response .text , "success" : True }
1646+
1647+ def post_process (self , row : dict , context : dict [str , str ] | None = None ) -> dict :
1648+ row = super ().post_process (row , context )
1649+ if context is not None :
1650+ # Get commit ID (sha) from context
1651+ row ["org" ] = context ["org" ]
1652+ row ["repo" ] = context ["repo" ]
1653+ row ["repo_id" ] = context ["repo_id" ]
1654+ row ["pull_number" ] = context ["pull_number" ]
1655+ row ["commit_id" ] = context ["commit_id" ]
1656+ return row
1657+
1658+ schema = th .PropertiesList (
1659+ # Parent keys
1660+ th .Property ("org" , th .StringType ),
1661+ th .Property ("repo" , th .StringType ),
1662+ th .Property ("repo_id" , th .IntegerType ),
1663+ th .Property ("pull_number" , th .IntegerType ),
1664+ th .Property ("commit_id" , th .StringType ),
1665+ # Rest
1666+ th .Property ("diff" , th .StringType ),
1667+ th .Property ("success" , th .BooleanType ),
1668+ th .Property ("error_message" , th .StringType ),
1669+ ).to_dict ()
15171670
15181671class ReviewsStream (GitHubRestStream ):
15191672 name = "reviews"
0 commit comments