Skip to content

Commit c4098be

Browse files
committed
refactor diff logic into new stream class
1 parent aae637d commit c4098be

File tree

2 files changed

+49
-120
lines changed

2 files changed

+49
-120
lines changed

tap_github/client.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,51 @@ def calculate_sync_cost(
305305
"""Return the cost of the last REST API call."""
306306
return {"rest": 1, "graphql": 0, "search": 0}
307307

308+
class GitHubDiffStream(GitHubRestStream):
    """Base class for GitHub diff streams.

    Subclasses request the ``application/vnd.github.v3.diff`` media type and
    receive the raw diff text rather than a JSON object. Every response is
    turned into exactly one record: either ``{"diff": ..., "success": True}``
    or ``{"success": False, "error_message": ...}``.
    """

    @property
    def http_headers(self) -> dict:
        """Return the http headers needed for diff requests."""
        headers = super().http_headers
        headers["Accept"] = "application/vnd.github.v3.diff"
        return headers

    def parse_response(self, response: requests.Response) -> Iterable[dict]:
        """Yield a single record holding the diff text or an error description.

        Args:
            response: The HTTP response returned for the diff request.

        Yields:
            ``{"diff": <text>, "success": True}`` for a 200 response whose
            declared size is within the limit; otherwise
            ``{"success": False, "error_message": <str>}``.
        """
        if response.status_code != 200:
            # Error bodies are not guaranteed to be a JSON object with a
            # "message" key, so never let decoding the error payload crash
            # the stream. requests' JSON decode errors subclass ValueError.
            try:
                contents = response.json()
            except ValueError:
                contents = None

            error_message = (
                contents.get("message") if isinstance(contents, dict) else None
            )
            if error_message is None:
                # Fall back to the HTTP reason phrase, then the bare status.
                error_message = response.reason or f"HTTP {response.status_code}"

            self.logger.info(
                "Skipping %s due to %d error: %s",
                self.name.replace("_", " "),
                response.status_code,
                error_message,
            )
            yield {
                "success": False,
                "error_message": error_message,
            }
            return

        # Check the declared size before touching ``response.text`` so that
        # oversized diffs are skipped instead of being decoded.
        if content_length_str := response.headers.get("Content-Length"):
            content_length = int(content_length_str)
            max_size = 41_943_040  # 40 MiB
            if content_length > max_size:
                self.logger.info(
                    "Skipping %s. The diff size (%.2f MiB) exceeded the maximum"
                    " size limit of 40 MiB.",
                    self.name.replace("_", " "),
                    content_length / 1024 / 1024,
                )
                yield {
                    "success": False,
                    "error_message": "Diff exceeded the maximum size limit of 40 MiB.",
                }
                return

        yield {"diff": response.text, "success": True}
352+
308353

309354
class GitHubGraphqlStream(GraphQLStream, GitHubRestStream):
310355
"""GitHub Graphql stream class."""

tap_github/repository_streams.py

Lines changed: 4 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from singer_sdk.exceptions import FatalAPIError
1111
from singer_sdk.helpers.jsonpath import extract_jsonpath
1212

13-
from tap_github.client import GitHubGraphqlStream, GitHubRestStream
13+
from tap_github.client import GitHubDiffStream, GitHubGraphqlStream, GitHubRestStream
1414
from tap_github.schema_objects import (
1515
files_object,
1616
label_object,
@@ -1169,53 +1169,14 @@ class CommitCommentsStream(GitHubRestStream):
11691169
th.Property("author_association", th.StringType),
11701170
).to_dict()
11711171

1172-
1173-
class CommitDiffsStream(GitHubRestStream):
1172+
class CommitDiffsStream(GitHubDiffStream):
11741173
name = "commit_diffs"
11751174
path = "/repos/{org}/{repo}/commits/{commit_id}"
11761175
primary_keys: ClassVar[list[str]] = ["commit_id"]
11771176
parent_stream_type = CommitsStream
11781177
ignore_parent_replication_key = False
11791178
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
11801179

1181-
@property
1182-
def http_headers(self) -> dict:
1183-
headers = super().http_headers
1184-
headers["Accept"] = "application/vnd.github.v3.diff"
1185-
return headers
1186-
1187-
def parse_response(self, response: requests.Response) -> Iterable[dict]:
1188-
"""Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1189-
if response.status_code != 200:
1190-
contents = response.json()
1191-
self.logger.info(
1192-
"Skipping commit due to %d error: %s",
1193-
response.status_code,
1194-
contents["message"],
1195-
)
1196-
yield {
1197-
"success": False,
1198-
"error_message": contents["message"],
1199-
}
1200-
return
1201-
1202-
if content_length_str := response.headers.get("Content-Length"):
1203-
content_length = int(content_length_str)
1204-
max_size = 41_943_040 # 40 MiB
1205-
if content_length > max_size:
1206-
self.logger.info(
1207-
"Skipping commit. The diff size (%.2f MiB) exceeded the maximum"
1208-
" size limit of 40 MiB.",
1209-
content_length / 1024 / 1024,
1210-
)
1211-
yield {
1212-
"success": False,
1213-
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
1214-
}
1215-
return
1216-
1217-
yield {"diff": response.text, "success": True}
1218-
12191180
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
12201181
row = super().post_process(row, context)
12211182
if context is not None:
@@ -1529,7 +1490,7 @@ def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict
15291490
return row
15301491

15311492

1532-
class PullRequestDiffsStream(GitHubRestStream):
1493+
class PullRequestDiffsStream(GitHubDiffStream):
15331494
name = "pull_request_diffs"
15341495
path = "/repos/{org}/{repo}/pulls/{pull_number}"
15351496
primary_keys: ClassVar[list[str]] = ["pull_id"]
@@ -1539,44 +1500,6 @@ class PullRequestDiffsStream(GitHubRestStream):
15391500
# Known Github API errors
15401501
tolerated_http_errors: ClassVar[list[int]] = [404, 406, 422, 502]
15411502

1542-
@property
1543-
def http_headers(self) -> dict:
1544-
headers = super().http_headers
1545-
headers["Accept"] = "application/vnd.github.v3.diff"
1546-
return headers
1547-
1548-
def parse_response(self, response: requests.Response) -> Iterable[dict]:
1549-
"""Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1550-
if response.status_code != 200:
1551-
contents = response.json()
1552-
self.logger.info(
1553-
"Skipping PR due to %d error: %s",
1554-
response.status_code,
1555-
contents["message"],
1556-
)
1557-
yield {
1558-
"success": False,
1559-
"error_message": contents["message"],
1560-
}
1561-
return
1562-
1563-
if content_length_str := response.headers.get("Content-Length"):
1564-
content_length = int(content_length_str)
1565-
max_size = 41_943_040 # 40 MiB
1566-
if content_length > max_size:
1567-
self.logger.info(
1568-
"Skipping PR. The diff size (%.2f MiB) exceeded the maximum size "
1569-
"limit of 40 MiB.",
1570-
content_length / 1024 / 1024,
1571-
)
1572-
yield {
1573-
"success": False,
1574-
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
1575-
}
1576-
return
1577-
1578-
yield {"diff": response.text, "success": True}
1579-
15801503
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
15811504
row = super().post_process(row, context)
15821505
if context is not None:
@@ -1601,53 +1524,14 @@ def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict
16011524
th.Property("error_message", th.StringType),
16021525
).to_dict()
16031526

1604-
1605-
class PullRequestCommitDiffsStream(GitHubRestStream):
1527+
class PullRequestCommitDiffsStream(GitHubDiffStream):
16061528
name = "pull_request_commit_diffs"
16071529
path = "/repos/{org}/{repo}/commits/{commit_id}"
16081530
primary_keys: ClassVar[list[str]] = ["commit_id"]
16091531
parent_stream_type = PullRequestCommitsStream
16101532
ignore_parent_replication_key = False
16111533
state_partitioning_keys: ClassVar[list[str]] = ["repo", "org"]
16121534

1613-
@property
1614-
def http_headers(self) -> dict:
1615-
headers = super().http_headers
1616-
headers["Accept"] = "application/vnd.github.v3.diff"
1617-
return headers
1618-
1619-
def parse_response(self, response: requests.Response) -> Iterable[dict]:
1620-
"""Parse the response to yield the diff text instead of an object and prevent buffer overflow.""" # noqa: E501
1621-
if response.status_code != 200:
1622-
contents = response.json()
1623-
self.logger.info(
1624-
"Skipping commit due to %d error: %s",
1625-
response.status_code,
1626-
contents["message"],
1627-
)
1628-
yield {
1629-
"success": False,
1630-
"error_message": contents["message"],
1631-
}
1632-
return
1633-
1634-
if content_length_str := response.headers.get("Content-Length"):
1635-
content_length = int(content_length_str)
1636-
max_size = 41_943_040 # 40 MiB
1637-
if content_length > max_size:
1638-
self.logger.info(
1639-
"Skipping commit. The diff size (%.2f MiB) exceeded the maximum"
1640-
" size limit of 40 MiB.",
1641-
content_length / 1024 / 1024,
1642-
)
1643-
yield {
1644-
"success": False,
1645-
"error_message": "Diff exceeded the maximum size limit of 40 MiB.",
1646-
}
1647-
return
1648-
1649-
yield {"diff": response.text, "success": True}
1650-
16511535
def post_process(self, row: dict, context: dict[str, str] | None = None) -> dict:
16521536
row = super().post_process(row, context)
16531537
if context is not None:

0 commit comments

Comments
 (0)