diff --git a/.github/workflows/check-pr.yml b/.github/workflows/check-pr.yml
new file mode 100644
index 0000000..b963618
--- /dev/null
+++ b/.github/workflows/check-pr.yml
@@ -0,0 +1,55 @@
+name: Check PR
+
+on:
+  pull_request:
+    branches:
+      - master
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  check-jobs:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.8'
+
+      - name: Install Python dependencies
+        run: |
+          pip install notion-client
+          pip install pangu
+          pip install python-slugify
+          pip install requests
+
+      - name: Run parse_readme
+        env:
+          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
+        run: PYTHONPATH=./ python jobs/parse_readme/main.py
+
+      - name: Run parse_sample_posts
+        env:
+          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
+        run: PYTHONPATH=./ python jobs/parse_sample_posts/main.py
+
+      - name: Run parse_sample_posts_for_hexo
+        env:
+          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
+        run: PYTHONPATH=./ python jobs/parse_sample_posts_for_hexo/main.py
+
+      - name: Lock PR on failure
+        if: failure() && github.event_name == 'pull_request'
+        env:
+          GH_TOKEN: ${{ secrets.GH_TOKEN }}
+        run: |
+          echo "Job failed, locking PR..."
+          gh pr comment ${{ github.event.pull_request.number }} --body "PR check failed. This PR is now locked."
+          gh pr lock ${{ github.event.pull_request.number }}
diff --git a/jobs/parse_readme/main.py b/jobs/parse_readme/main.py
index cab4293..4c28380 100644
--- a/jobs/parse_readme/main.py
+++ b/jobs/parse_readme/main.py
@@ -13,30 +13,19 @@ def start():
     channel = Config.writer() or 'default'
     NotionWriter.clean_output()
 
-    main_page = NotionReader.read_main_page()
-    # Official API: read_main_page returns dict, not block with children.
-    # We use read_all_pages to get all subpages recursively (or just use _read_post_pages logic)
-    # But read_all_pages filters by config.
-    # Here we want specific pages.
-    # Let's use read_all_pages but we need to ensure config doesn't filter them out?
-    # Config is set in __main__.
-    # But here we want specific titles.
-    # Let's use NotionReader._read_post_pages() which reads all from main page, then we filter.
-    # Wait, _read_post_pages() applies config filter!
-    # If we want ALL pages to filter manually, we might need to temporarily set config or use internal method.
-    # Actually, _read_post_pages calls _recurse_read_page.
-    # Let's just use _recurse_read_page manually or set Config.page_titles to ['all'] temporarily?
-    # But Config is global.
-    # Let's just use NotionReader.read_all_pages() and assume Config is set to 'all' (default) or we set it.
-    # In __main__, Config.parse_configs() is called. Default is 'all'.
-    # So read_all_pages() should return all pages.
-
-    all_pages = NotionReader.read_all_pages()
-    source_pages = Utils.find(all_pages, lambda it: NotionReader._get_page_title(it) in [
+    target_titles = [
         "NotionDown README",
         "NotionDown GetTokenV2",
         "NotionDown Custom Config",
-    ])
+    ]
+
+    source_pages = []
+    for title in target_titles:
+        page = NotionReader.read_page_with_title(title)
+        if page:
+            source_pages.append(page)
+        else:
+            print(f"Warning: Page not found for title: {title}")
 
     for source_page in source_pages:
         md_page = NotionReader._parse_page(source_page)
diff --git a/jobs/parse_sample_posts_for_hexo/main.py b/jobs/parse_sample_posts_for_hexo/main.py
index 2f46f0d..7e8430e 100644
--- a/jobs/parse_sample_posts_for_hexo/main.py
+++ b/jobs/parse_sample_posts_for_hexo/main.py
@@ -39,8 +39,9 @@ def start():
     Config.set_download_image(True)
     Config.set_writer('Hexo')
-    # For debug
-    # Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")
-    # Config.set_page_titles_match(["^(Hexo page -)"])
+    # Fall back to the sample blog when no blog_url is configured
+    if not Config.blog_url():
+        Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")
+    Config.set_page_titles_match(["^(Hexo page -)"])
 
     print("")
     print("Run with configs:")
diff --git a/notion_reader.py b/notion_reader.py
index 9d7150b..1bac3db 100644
--- a/notion_reader.py
+++ b/notion_reader.py
@@ -43,16 +43,11 @@ def handle_post() -> List[NotionPage]:
 
     @staticmethod
     def handle_page_with_title(page_title: str) -> typing.Optional[NotionPage]:
         print("#handle_page_with_title: " + page_title)
-        pages = NotionReader._read_post_pages()
-        # Note: Official API pages have 'properties' -> 'title' -> 'title' -> 'plain_text'
-        # But here 'pages' are likely Block objects or Page objects.
-        # We need to ensure we are filtering correctly.
-        # For now, assuming _read_post_pages returns a list of Page objects (dicts).
-        find_one = Utils.find_one(pages, lambda it: NotionReader._get_page_title(it) == page_title)
-        if not find_one:
+        page = NotionReader.read_page_with_title(page_title)
+        if not page:
             return None
-        return NotionReader._parse_page(find_one)
+        return NotionReader._parse_page(page)
 
     @staticmethod
     def handle_page(page) -> NotionPage:
@@ -60,12 +55,16 @@ def handle_page(page) -> NotionPage:
         return NotionReader._parse_page(page)
 
     @staticmethod
-    def read_main_page() -> Dict[str, Any]:
+    def read_main_page() -> typing.Optional[Dict[str, Any]]:
         print("#read_main_page")
+        if not Config.blog_url():
+            return None
         try:
             page_id = NotionUtils.extract_id(Config.blog_url())
             return NotionReader.get_client().pages.retrieve(page_id=page_id)
         except Exception as e:
+            # If retrieval fails (e.g. invalid permissions), re-raise so the
+            # failure stays visible to the caller instead of being swallowed.
             raise e
 
     @staticmethod
@@ -73,19 +72,29 @@ def read_all_pages() -> typing.List[Dict[str, Any]]:
         print("#read_all_pages")
         return NotionReader._read_post_pages()
 
+    @staticmethod
+    def search_pages(query: str) -> typing.List[Dict[str, Any]]:
+        print("#search_pages: " + query)
+        response = NotionReader.get_client().search(query=query, filter={
+            "value": "page",
+            "property": "object"
+        })
+        return response.get('results', [])
+
     @staticmethod
     def read_page_with_title(page_title: str) -> typing.Optional[Dict[str, Any]]:
         print("#read_page_with_title")
-        return Utils.find_one(NotionReader.read_all_pages(), lambda it: NotionReader._get_page_title(it) == page_title)
+
+        # Get pages within scope (global or blog-scoped)
+        scoped_pages = NotionReader._get_scoped_pages()
+
+        # Find the page in the scoped list
+        return Utils.find_one(scoped_pages, lambda it: NotionReader._get_page_title(it) == page_title)
 
     @staticmethod
     def _read_post_pages() -> typing.List[Dict[str, Any]]:
-        # get all pages
-        main_page = NotionReader.read_main_page()
-        page_blocks = []
-
-        # In official API, we need to list children of the main page to find sub-pages
-        NotionReader._recurse_read_page(page_blocks, main_page)
+        # Get all pages within the configured scope
+        page_blocks = NotionReader._get_scoped_pages()
 
         # filter by config
         titles = Config.page_titles()
@@ -100,6 +109,102 @@ def _read_post_pages() -> typing.List[Dict[str, Any]]:
         )]
         filter_by_titles.extend([it for it in filter_by_titles_match if it not in filter_by_titles])
         return filter_by_titles
+
+    @staticmethod
+    def _get_scoped_pages() -> typing.List[Dict[str, Any]]:
+        """
+        Retrieves all pages based on the configuration scope.
+        - If blog_url is set: fetch all, then filter descendants of blog_url.
+        - If blog_url is NOT set: fetch all (workspace scope).
+        """
+        # 1. Fetch ALL pages in the workspace (Search API)
+        all_pages = NotionReader._get_all_pages_in_workspace()
+
+        # 2. Apply scope
+        if Config.blog_url():
+            print("Scope: filtering pages under blog_url...")
+            root_id = NotionUtils.extract_id(Config.blog_url())
+            return NotionReader._filter_descendants(all_pages, root_id)
+
+        print("Scope: global workspace")
+        return all_pages
+
+    @staticmethod
+    def _get_all_pages_in_workspace() -> typing.List[Dict[str, Any]]:
+        """
+        Fetches ALL pages in the workspace using the Search API with pagination.
+        """
+        all_pages = []
+        has_more = True
+        start_cursor = None
+
+        print("Searching all pages in workspace...")
+        while has_more:
+            # Search for pages only
+            response = NotionReader.get_client().search(
+                filter={"value": "page", "property": "object"},
+                start_cursor=start_cursor,
+                page_size=100
+            )
+            results = response.get('results', [])
+            all_pages.extend(results)
+            has_more = response.get('has_more')
+            start_cursor = response.get('next_cursor')
+            print(f"Fetched {len(results)} pages, total so far: {len(all_pages)}")
+
+        return all_pages
+
+    @staticmethod
+    def _filter_descendants(all_pages: typing.List[Dict[str, Any]], root_id: str) -> typing.List[Dict[str, Any]]:
+        """
+        Filters the list of all pages to return only those that are descendants of the root_id.
+        Reconstructs the tree structure in memory.
+ """ + # Normalize root_id + if not root_id: + return [] + root_id = root_id.replace('-', '') + + # Build children map: parent_id -> list of child pages + children_map = {} + page_map = {} + + for page in all_pages: + pid = page.get('id').replace('-', '') + page_map[pid] = page + + parent = page.get('parent') + if parent and parent.get('type') == 'page_id': + parent_id = parent.get('page_id').replace('-', '') + if parent_id not in children_map: + children_map[parent_id] = [] + children_map[parent_id].append(page) + + # BFS to find all descendants + descendants = [] + queue = [root_id] + visited = set() + + while queue: + current_id = queue.pop(0) + if current_id in visited: + continue + visited.add(current_id) + + # If current_id is in page_map (and it's not the root iterator start), add it + # (We want to include root if it is in all_pages? The caller handles root usually, + # but let's gather everything reachable. + # If root_id is the main page content, we usually want it. + if current_id in page_map: + descendants.append(page_map[current_id]) + + # Add children to queue + if current_id in children_map: + for child in children_map[current_id]: + child_id = child.get('id').replace('-', '') + queue.append(child_id) + + return descendants @staticmethod def _recurse_read_page(page_blocks: typing.List[Dict[str, Any]], parent_page: Dict[str, Any]): diff --git a/tests/notion-sdk-py-official-apis/notion_reader_test.py b/tests/notion-sdk-py-official-apis/notion_reader_test.py index b39015d..3002de7 100644 --- a/tests/notion-sdk-py-official-apis/notion_reader_test.py +++ b/tests/notion-sdk-py-official-apis/notion_reader_test.py @@ -125,3 +125,26 @@ def test_read_notion_page_with_hexo(self): test_pages = NotionReader.read_all_pages() self.assertTrue(len(test_pages) > 0) + def test_filter_descendants(self): + # Mock pages + root = {'id': 'root', 'parent': {'type': 'workspace'}} + child1 = {'id': 'c1', 'parent': {'type': 'page_id', 'page_id': 'root'}} + child2 = {'id': 'c2', 'parent': {'type': 'page_id', 'page_id': 'root'}} + grandchild1 = {'id': 'gc1', 'parent': {'type': 'page_id', 'page_id': 'c1'}} + outsider = {'id': 'other', 'parent': {'type': 'workspace'}} + outsider_child = {'id': 'oc1', 'parent': {'type': 'page_id', 'page_id': 'other'}} + + all_pages = [root, child1, child2, grandchild1, outsider, outsider_child] + + # Test filtering + descendants = NotionReader._filter_descendants(all_pages, 'root') + ids = [p['id'] for p in descendants] + + self.assertIn('root', ids) + self.assertIn('c1', ids) + self.assertIn('c2', ids) + self.assertIn('gc1', ids) + self.assertNotIn('other', ids) + self.assertNotIn('oc1', ids) + self.assertEqual(len(ids), 4) + diff --git a/tests/notion-sdk-py-official-apis/test_optimization_consistency.py b/tests/notion-sdk-py-official-apis/test_optimization_consistency.py new file mode 100644 index 0000000..8ba1f9a --- /dev/null +++ b/tests/notion-sdk-py-official-apis/test_optimization_consistency.py @@ -0,0 +1,136 @@ +import unittest +import os +import time + +from config import Config +from notion_reader import NotionReader +from utils.utils import Utils + +class OptimizationConsistencyTest(unittest.TestCase): + + def setUp(self): + Config.parse_configs() + Config.set_debuggable(True) + # Use the standard sample blog URL used in other tests/jobs + Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34") + Config.check_required_args() + + def test_job_parse_sample_posts(self): + """ + Job: jobs/parse_sample_posts/main.py + 
+        Config: Default (blog_url set, page_titles=['all'])
+        """
+        print("\n=== Test Job: parse_sample_posts ===")
+        # Config already set in setUp
+        self._verify_strategies()
+
+    def test_job_parse_sample_posts_for_hexo(self):
+        """
+        Job: jobs/parse_sample_posts_for_hexo/main.py
+        Config: blog_url set, page_titles_match=["^(Hexo page -)"]
+        """
+        print("\n=== Test Job: parse_sample_posts_for_hexo ===")
+        Config.set_page_titles_match(["^(Hexo page -)"])
+        self._verify_strategies()
+
+    def test_job_parse_readme(self):
+        """
+        Job: jobs/parse_readme/main.py
+        Config: blog_url set; specific titles are looked up manually.
+        Note: this job calls read_page_with_title multiple times, so we
+        verify that read_page_with_title uses the scoped optimization.
+        """
+        print("\n=== Test Job: parse_readme ===")
+        target_titles = [
+            "NotionDown README",
+            "NotionDown GetTokenV2",
+            "NotionDown Custom Config",
+        ]
+
+        # 1. Optimized
+        print("\n[Optimized Strategy] Running...")
+        start_time = time.time()
+        optimized_pages = []
+        for title in target_titles:
+            p = NotionReader.read_page_with_title(title)
+            if p: optimized_pages.append(p)
+        print(f"[Optimized Strategy] Done in {time.time() - start_time:.2f}s. Found {len(optimized_pages)} pages.")
+
+        # 2. Legacy (simulated)
+        # The old read_page_with_title called read_all_pages() ->
+        # _read_post_pages() -> _recurse_read_page(), and then find_one.
+        print("\n[Legacy Strategy] Running...")
+        start_time = time.time()
+        legacy_pages = []
+
+        # Simulate the recursive full fetch
+        main_page = NotionReader.read_main_page()
+        all_recurse_pages = []
+        if main_page:
+            NotionReader._recurse_read_page(all_recurse_pages, main_page)
+
+        for title in target_titles:
+            # Find in all_recurse_pages
+            p = Utils.find_one(all_recurse_pages, lambda it: NotionReader._get_page_title(it) == title)
+            if p: legacy_pages.append(p)
+
+        print(f"[Legacy Strategy] Done in {time.time() - start_time:.2f}s. Found {len(legacy_pages)} pages.")
+
+        self._compare_page_lists(optimized_pages, legacy_pages)
+
+
+    def _verify_strategies(self):
+        # 1. Run Optimized
+        print("\n[Optimized Strategy] Running...")
+        start_time = time.time()
+        optimized_pages = NotionReader._read_post_pages()
+        print(f"[Optimized Strategy] Done in {time.time() - start_time:.2f}s. Found {len(optimized_pages)} pages.")
+
+        # 2. Run Legacy
+        print("\n[Legacy Strategy] Running...")
+        start_time = time.time()
+        main_page = NotionReader.read_main_page()
+        legacy_pages = []
+        if main_page:
+            NotionReader._recurse_read_page(legacy_pages, main_page)
+        # Apply the same filter as _read_post_pages does
+        legacy_pages = self._apply_title_filter(legacy_pages)
+
+        print(f"[Legacy Strategy] Done in {time.time() - start_time:.2f}s. Found {len(legacy_pages)} pages.")
+
+        # 3. Compare
+        self._compare_page_lists(optimized_pages, legacy_pages)
+
+
+    def _apply_title_filter(self, page_blocks):
+        # Replicates the filter logic from NotionReader._read_post_pages
+        import re
+        titles = Config.page_titles()
+        titles_match = Config.page_titles_match() or []
+
+        if titles == ['all'] and (not titles_match or len(titles_match) == 0):
+            return page_blocks
+
+        filter_by_titles = [it for it in page_blocks if NotionReader._get_page_title(it) in titles]
+        filter_by_titles_match = [it for it in page_blocks if Utils.find_one(
+            titles_match,
+            lambda match: re.compile(match).match(NotionReader._get_page_title(it))
+        )]
+        filter_by_titles.extend([it for it in filter_by_titles_match if it not in filter_by_titles])
+        return filter_by_titles
+
+    def _compare_page_lists(self, list_a, list_b):
+        """
+        Asserts that two lists of Notion pages contain identical sets of pages by ID.
+        """
+        ids_a = sorted([p['id'].replace('-', '') for p in list_a])
+        ids_b = sorted([p['id'].replace('-', '') for p in list_b])
+
+        # Check count
+        self.assertEqual(len(ids_a), len(ids_b), f"Page count mismatch! Optimized: {len(ids_a)}, Legacy: {len(ids_b)}")
+
+        # Check IDs
+        for id_a, id_b in zip(ids_a, ids_b):
+            self.assertEqual(id_a, id_b, f"Mismatch found! ID {id_a} vs {id_b}")
+
+        print("Verification successful: both strategies returned identical page sets.")
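
As a quick sanity check, the new lookup path can be exercised end to end in a few lines. This is a minimal sketch, not part of the diff above: it assumes NOTION_TOKEN is already exported (as in the CI jobs) and reuses the sample blog URL; the page title is one of the targets from jobs/parse_readme/main.py.

    from config import Config
    from notion_reader import NotionReader

    Config.parse_configs()
    Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")

    # read_page_with_title now issues one paginated workspace search and
    # filters descendants of blog_url in memory, instead of recursing
    # through child blocks page by page.
    page = NotionReader.read_page_with_title("NotionDown README")
    if page:
        md_page = NotionReader._parse_page(page)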