1 | 1 | import asyncio |
2 | 2 | import typing as t |
3 | 3 | from datetime import datetime |
4 | 5 |  |
5 | 6 | from notion_client import AsyncClient |
| 7 | +from notion_client.errors import RequestTimeoutError, APIResponseError |
6 | 8 | from notion_client.helpers import async_collect_paginated_api |
7 | 9 | from tqdm import tqdm # type: ignore |
8 | 10 |  |
9 | 11 | from _types import AttackType, Paper, Focus |
10 | 12 |  |
| 13 | +# Retry and batching constants |
| 14 | +MAX_RETRIES = 5  # attempts per Notion API call |
| 15 | +RETRY_DELAY = 5  # base backoff delay, in seconds |
| 16 | +MAX_BATCH_SIZE = 5  # papers written per batch |
| 17 | + |
11 | 18 | NotionClient = AsyncClient |
12 | 19 |  |
13 | 20 |  |
14 | 21 | def get_notion_client(token: str) -> NotionClient: |
15 | | - return NotionClient(auth=token) |
| 22 | + return NotionClient(auth=token, timeout_ms=60000) # 60-second timeout |
16 | 23 |  |
17 | 24 |  |
18 | 25 | async def get_papers_from_notion(client: NotionClient, database_id: str, *, max: int | None = None) -> list[Paper]: |
19 | | - if max: |
20 | | - results = await client.databases.query(database_id=database_id, page_size=max) |
21 | | - results = results['results'] |
22 | | - else: |
23 | | - results = await async_collect_paginated_api( |
24 | | - client.databases.query, database_id=database_id |
25 | | - ) |
| 26 | + retries = 0 |
| 27 | + results = [] |
| 28 | + |
| 29 | + while retries < MAX_RETRIES: |
| 30 | + try: |
| 31 | + if max: |
| 32 | + response = await client.databases.query(database_id=database_id, page_size=max) |
| 33 | + results = response['results'] |
| 34 | + else: |
| 35 | + results = await async_collect_paginated_api( |
| 36 | + client.databases.query, database_id=database_id |
| 37 | + ) |
| 38 | + break |
| 39 | + except (RequestTimeoutError, APIResponseError) as e: |
| 40 | + retries += 1 |
| 41 | + if retries >= MAX_RETRIES: |
| 42 | + print(f"Failed to get papers from Notion after {MAX_RETRIES} attempts: {str(e)}") |
| 43 | + return [] |
| 44 | + else: |
| 45 | + print(f"Notion API error when fetching papers, retrying ({retries}/{MAX_RETRIES}): {str(e)}") |
| 46 | +                # Exponential backoff plus a small linear increment (deterministic, no random jitter) |
| 47 | + wait_time = RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries) |
| 48 | + print(f"Waiting {wait_time:.1f} seconds before retry...") |
| 49 | + await asyncio.sleep(wait_time) |
26 | 50 |  |
27 | 51 | papers: list[Paper] = [] |
28 | 52 | for result in results: |
@@ -67,34 +91,64 @@ async def get_papers_from_notion(client: NotionClient, database_id: str, *, max: |
67 | 91 | async def write_papers_to_notion( |
68 | 92 | client: NotionClient, database_id: str, papers: list[Paper] |
69 | 93 | ) -> None: |
70 | | - for paper in tqdm(papers): |
71 | | - properties: dict[str, t.Any] = {} |
72 | | - if paper.title and paper._original_state["title"] != paper.title: |
73 | | - properties["Title"] = {"title": [{"text": {"content": paper.title}}]} |
74 | | - if paper.url and paper._original_state["url"] != paper.url: |
75 | | - properties["URL"] = {"url": paper.url} |
76 | | - if paper.summary and paper._original_state["summary"] != paper.summary: |
77 | | - properties["Summary"] = { |
78 | | - "rich_text": [{"text": {"content": paper.summary}}] |
79 | | - } |
80 | | - if paper.authors and paper._original_state["authors"] != paper.authors: |
81 | | - properties["Authors"] = { |
82 | | - "multi_select": [{"name": author} for author in paper.authors[:5]] # Limit to 5 authors |
83 | | - } |
84 | | - if paper.published and paper._original_state["published"] != paper.published: |
85 | | - properties["Published"] = {"date": {"start": paper.published.isoformat()}} |
86 | | - if paper.focus and paper._original_state["focus"] != paper.focus: |
87 | | - properties["Focus"] = {"select": {"name": paper.focus.value}} |
88 | | - if paper.attack_type and paper._original_state["attack_type"] != paper.attack_type: |
89 | | - properties["Attack Type"] = {"select": {"name": paper.attack_type.value}} |
90 | | - if paper.explored and paper._original_state["explored"] != paper.explored: |
91 | | - properties["Explored"] = {"checkbox": paper.explored} |
92 | | - |
93 | | - if paper.page_id: |
94 | | - await client.pages.update(paper.page_id, properties=properties) |
95 | | - else: |
96 | | - await client.pages.create( |
97 | | - parent={"database_id": database_id}, properties=properties |
98 | | - ) |
| 94 | +    # Process papers in small batches, pausing between batches |
| 95 | + for i in range(0, len(papers), MAX_BATCH_SIZE): |
| 96 | + batch = papers[i:i+MAX_BATCH_SIZE] |
| 97 | + print(f"Processing batch {i//MAX_BATCH_SIZE + 1}/{(len(papers) + MAX_BATCH_SIZE - 1)//MAX_BATCH_SIZE}") |
| 98 | + |
| 99 | + for paper in tqdm(batch): |
| 100 | + properties: dict[str, t.Any] = {} |
| 101 | + if paper.title: |
| 102 | + properties["Title"] = {"title": [{"text": {"content": paper.title}}]} |
| 103 | + if paper.url: |
| 104 | + properties["URL"] = {"url": paper.url} |
| 105 | + if paper.summary: |
| 106 | + properties["Summary"] = { |
| 107 | + "rich_text": [{"text": {"content": paper.summary}}] |
| 108 | + } |
| 109 | + if paper.authors: |
| 110 | + properties["Authors"] = { |
| 111 | + "multi_select": [{"name": author} for author in paper.authors] |
| 112 | + } |
| 113 | + if paper.published: |
| 114 | + properties["Published"] = {"date": {"start": paper.published.isoformat()}} |
| 115 | + if paper.focus: |
| 116 | + properties["Focus"] = {"select": {"name": paper.focus.value}} |
| 117 | + if paper.attack_type: |
| 118 | + properties["Attack Type"] = {"select": {"name": paper.attack_type.value}} |
| 119 | + if paper.explored is not None: |
| 120 | + properties["Explored"] = {"checkbox": paper.explored} |
| 121 | + |
| 122 | +            # Retry loop with exponential backoff |
| 123 | + retries = 0 |
| 124 | + while retries < MAX_RETRIES: |
| 125 | + try: |
| 126 | + if paper.page_id: |
| 127 | + await client.pages.update(paper.page_id, properties=properties) |
| 128 | + else: |
| 129 | + await client.pages.create( |
| 130 | + parent={"database_id": database_id}, properties=properties |
| 131 | + ) |
| 132 | + # Success, break out of retry loop |
| 133 | + break |
| 134 | + except (RequestTimeoutError, APIResponseError) as e: |
| 135 | + retries += 1 |
| 136 | + if retries >= MAX_RETRIES: |
| 137 | +                        print(f"Failed to update/create paper after {MAX_RETRIES} attempts: {(paper.title or '(untitled)')[:50]}...") |
| 138 | + # Don't raise - continue with other papers |
| 139 | + break |
| 140 | + else: |
| 141 | + print(f"Notion API error, retrying ({retries}/{MAX_RETRIES}): {str(e)}") |
| 142 | +                        # Exponential backoff, same schedule as in get_papers_from_notion |
| 143 | + wait_time = RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries) |
| 144 | + print(f"Waiting {wait_time:.1f} seconds before retry...") |
| 145 | + await asyncio.sleep(wait_time) |
| 146 | + |
| 147 | + # Add a small delay between papers regardless of success/failure |
| 148 | + await asyncio.sleep(1) |
| 149 | + |
| 150 | + if i + MAX_BATCH_SIZE < len(papers): |
| 151 | +        print("Pausing for 10 seconds between batches...") |
| 152 | + await asyncio.sleep(10) |
99 | 153 |  |
100 | 154 | return None |
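
A note on the backoff used in both retry loops: RETRY_DELAY * (2 ** (retries - 1)) + (RETRY_DELAY * 0.1 * retries) is fully deterministic (waits of 5.5, 11.0, 21.5, and 42.0 seconds for retries 1-4), so concurrent writers that fail together also retry together. Below is a minimal sketch of a variant with true random jitter, factored into a reusable helper; with_retries and its parameter names are illustrative, not part of this commit:

import asyncio
import random
import typing as t

from notion_client.errors import APIResponseError, RequestTimeoutError

T = t.TypeVar("T")

async def with_retries(
    fn: t.Callable[..., t.Awaitable[T]],
    *args: t.Any,
    max_retries: int = 5,     # mirrors MAX_RETRIES above
    base_delay: float = 5.0,  # mirrors RETRY_DELAY above, in seconds
    **kwargs: t.Any,
) -> T:
    """Retry an async callable on Notion errors, with exponential backoff and random jitter."""
    for attempt in range(1, max_retries + 1):
        try:
            return await fn(*args, **kwargs)
        except (RequestTimeoutError, APIResponseError):
            if attempt == max_retries:
                raise
            # Exponential base gives 5, 10, 20, 40 s for attempts 1-4;
            # random.uniform adds up to one extra base_delay of jitter.
            await asyncio.sleep(base_delay * 2 ** (attempt - 1) + random.uniform(0, base_delay))
    raise RuntimeError("unreachable")

Both call sites could then collapse to something like: results = await with_retries(async_collect_paginated_api, client.databases.query, database_id=database_id). The per-paper sleep(1) and the 10-second batch pauses remain a reasonable floor regardless, given Notion's documented average rate limit of roughly three requests per second.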