Skip to content

Commit 7ebdf3f

Browse files
chore: add backoff and retry logic to paperstack workflow
1 parent 43f6769 commit 7ebdf3f

File tree

4 files changed

+117
-58
lines changed

4 files changed

+117
-58
lines changed

.DS_Store

6 KB
Binary file not shown.

.github/workflows/core.yml

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,35 +4,39 @@ on:
44
workflow_call:
55
inputs:
66
search-arxiv:
7-
description: 'Search Arxiv?'
7+
description: "Search Arxiv?"
88
required: true
9-
type: 'boolean'
9+
type: "boolean"
1010
search-scholar:
11-
description: 'Search Semantic Scholar?'
11+
description: "Search Semantic Scholar?"
1212
required: true
13-
type: 'boolean'
13+
type: "boolean"
14+
push:
15+
branches:
16+
- "ads/eng-1935-paperstack-1-notion-api-timeouts-causing-paperstack"
1417

1518
jobs:
1619
run-script:
1720
runs-on: ubuntu-latest
21+
timeout-minutes: 120 # Adding 2-hour timeout for the entire job
1822
steps:
19-
- name: Check out code
20-
uses: actions/checkout@v2
23+
- name: Check out code
24+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 #v4.2.2
2125

22-
- name: Set up Python
23-
uses: actions/setup-python@v2
24-
with:
25-
python-version: '3.x'
26+
- name: Set up Python
27+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 #v5.6.0
28+
with:
29+
python-version: "3.x"
2630

27-
- name: Install dependencies
28-
run: |
29-
python -m pip install --upgrade pip
30-
pip install -r requirements.txt
31+
- name: Install dependencies
32+
run: |
33+
python -m pip install --upgrade pip
34+
pip install -r requirements.txt
3135
32-
- name: Run paperstack
33-
run: |
34-
python paperstack.py ${{ inputs.search-arxiv && '--search-arxiv' || '' }} ${{ inputs.search-scholar && '--search-semantic-scholar' || '' }}
35-
env:
36-
NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
37-
NOTION_DATABASE_ID: ${{ secrets.NOTION_DATABASE_ID }}
38-
OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }}
36+
- name: Run paperstack
37+
run: |
38+
python paperstack.py ${{ inputs.search-arxiv && '--search-arxiv' || '' }} ${{ inputs.search-scholar && '--search-semantic-scholar' || '' }}
39+
env:
40+
NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
41+
NOTION_DATABASE_ID: ${{ secrets.NOTION_DATABASE_ID }}
42+
OPENAI_API_TOKEN: ${{ secrets.OPENAI_API_TOKEN }}

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,4 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
#.idea/
161+
notion_utils.py

notion_utils.py

Lines changed: 91 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,52 @@
11
import asyncio
22
import typing as t
33
from datetime import datetime
4+
import time
45

56
from notion_client import AsyncClient
7+
from notion_client.errors import RequestTimeoutError, APIResponseError
68
from notion_client.helpers import async_collect_paginated_api
79
from tqdm import tqdm # type: ignore
810

911
from _types import AttackType, Paper, Focus
1012

13+
# Retry constants
14+
MAX_RETRIES = 5
15+
RETRY_DELAY = 5
16+
MAX_BATCH_SIZE = 5
17+
1118
NotionClient = AsyncClient
1219

1320

1421
def get_notion_client(token: str) -> NotionClient:
15-
return NotionClient(auth=token)
22+
return NotionClient(auth=token, timeout_ms=60000) # 60-second timeout
1623

1724

1825
async def get_papers_from_notion(client: NotionClient, database_id: str, *, max: int | None = None) -> list[Paper]:
19-
if max:
20-
results = await client.databases.query(database_id=database_id, page_size=max)
21-
results = results['results']
22-
else:
23-
results = await async_collect_paginated_api(
24-
client.databases.query, database_id=database_id
25-
)
26+
retries = 0
27+
results = []
28+
29+
while retries < MAX_RETRIES:
30+
try:
31+
if max:
32+
response = await client.databases.query(database_id=database_id, page_size=max)
33+
results = response['results']
34+
else:
35+
results = await async_collect_paginated_api(
36+
client.databases.query, database_id=database_id
37+
)
38+
break
39+
except (RequestTimeoutError, APIResponseError) as e:
40+
retries += 1
41+
if retries >= MAX_RETRIES:
42+
print(f"Failed to get papers from Notion after {MAX_RETRIES} attempts: {str(e)}")
43+
return []
44+
else:
45+
print(f"Notion API error when fetching papers, retrying ({retries}/{MAX_RETRIES}): {str(e)}")
46+
# Exponential backoff with jitter
47+
wait_time = RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries)
48+
print(f"Waiting {wait_time:.1f} seconds before retry...")
49+
await asyncio.sleep(wait_time)
2650

2751
papers: list[Paper] = []
2852
for result in results:
@@ -67,34 +91,64 @@ async def get_papers_from_notion(client: NotionClient, database_id: str, *, max:
6791
async def write_papers_to_notion(
6892
client: NotionClient, database_id: str, papers: list[Paper]
6993
) -> None:
70-
for paper in tqdm(papers):
71-
properties: dict[str, t.Any] = {}
72-
if paper.title and paper._original_state["title"] != paper.title:
73-
properties["Title"] = {"title": [{"text": {"content": paper.title}}]}
74-
if paper.url and paper._original_state["url"] != paper.url:
75-
properties["URL"] = {"url": paper.url}
76-
if paper.summary and paper._original_state["summary"] != paper.summary:
77-
properties["Summary"] = {
78-
"rich_text": [{"text": {"content": paper.summary}}]
79-
}
80-
if paper.authors and paper._original_state["authors"] != paper.authors:
81-
properties["Authors"] = {
82-
"multi_select": [{"name": author} for author in paper.authors[:5]] # Limit to 5 authors
83-
}
84-
if paper.published and paper._original_state["published"] != paper.published:
85-
properties["Published"] = {"date": {"start": paper.published.isoformat()}}
86-
if paper.focus and paper._original_state["focus"] != paper.focus:
87-
properties["Focus"] = {"select": {"name": paper.focus.value}}
88-
if paper.attack_type and paper._original_state["attack_type"] != paper.attack_type:
89-
properties["Attack Type"] = {"select": {"name": paper.attack_type.value}}
90-
if paper.explored and paper._original_state["explored"] != paper.explored:
91-
properties["Explored"] = {"checkbox": paper.explored}
92-
93-
if paper.page_id:
94-
await client.pages.update(paper.page_id, properties=properties)
95-
else:
96-
await client.pages.create(
97-
parent={"database_id": database_id}, properties=properties
98-
)
94+
# Process papers in smaller batches with pauses between
95+
for i in range(0, len(papers), MAX_BATCH_SIZE):
96+
batch = papers[i:i+MAX_BATCH_SIZE]
97+
print(f"Processing batch {i//MAX_BATCH_SIZE + 1}/{(len(papers) + MAX_BATCH_SIZE - 1)//MAX_BATCH_SIZE}")
98+
99+
for paper in tqdm(batch):
100+
properties: dict[str, t.Any] = {}
101+
if paper.title:
102+
properties["Title"] = {"title": [{"text": {"content": paper.title}}]}
103+
if paper.url:
104+
properties["URL"] = {"url": paper.url}
105+
if paper.summary:
106+
properties["Summary"] = {
107+
"rich_text": [{"text": {"content": paper.summary}}]
108+
}
109+
if paper.authors:
110+
properties["Authors"] = {
111+
"multi_select": [{"name": author} for author in paper.authors]
112+
}
113+
if paper.published:
114+
properties["Published"] = {"date": {"start": paper.published.isoformat()}}
115+
if paper.focus:
116+
properties["Focus"] = {"select": {"name": paper.focus.value}}
117+
if paper.attack_type:
118+
properties["Attack Type"] = {"select": {"name": paper.attack_type.value}}
119+
if paper.explored is not None:
120+
properties["Explored"] = {"checkbox": paper.explored}
121+
122+
# Retry logic with progressive backoff
123+
retries = 0
124+
while retries < MAX_RETRIES:
125+
try:
126+
if paper.page_id:
127+
await client.pages.update(paper.page_id, properties=properties)
128+
else:
129+
await client.pages.create(
130+
parent={"database_id": database_id}, properties=properties
131+
)
132+
# Success, break out of retry loop
133+
break
134+
except (RequestTimeoutError, APIResponseError) as e:
135+
retries += 1
136+
if retries >= MAX_RETRIES:
137+
print(f"Failed to update/create paper after {MAX_RETRIES} attempts: {paper.title[:50]}...")
138+
# Don't raise - continue with other papers
139+
break
140+
else:
141+
print(f"Notion API error, retrying ({retries}/{MAX_RETRIES}): {str(e)}")
142+
# Exponential backoff with longer delays
143+
wait_time = RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries)
144+
print(f"Waiting {wait_time:.1f} seconds before retry...")
145+
await asyncio.sleep(wait_time)
146+
147+
# Add a small delay between papers regardless of success/failure
148+
await asyncio.sleep(1)
149+
150+
if i + MAX_BATCH_SIZE < len(papers):
151+
print(f"Pausing for 10 seconds between batches...")
152+
await asyncio.sleep(10)
99153

100154
return None

0 commit comments

Comments
 (0)