Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .github/workflows/check-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# CI workflow: runs the NotionDown parse jobs against the live Notion API
# on every PR targeting master, or on manual dispatch.
name: Check PR

on:
  pull_request:
    branches:
      - master
  workflow_dispatch:

# pull-requests: write is required by the "Lock PR on failure" step.
permissions:
  contents: read
  pull-requests: write

jobs:
  check-jobs:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.8
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'

      - name: Install Python dependencies
        run: |
          pip install notion-client
          pip install pangu
          pip install python-slugify
          pip install requests

      # The three parse jobs run sequentially; the first failure skips
      # the remaining ones and triggers the lock step below.
      - name: Run parse_readme
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
        run: PYTHONPATH=./ python jobs/parse_readme/main.py

      - name: Run parse_sample_posts
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
        run: PYTHONPATH=./ python jobs/parse_sample_posts/main.py

      - name: Run parse_sample_posts_for_hexo
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
        run: PYTHONPATH=./ python jobs/parse_sample_posts_for_hexo/main.py

      - name: Lock PR on failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN }}
        run: |
          echo "Job failed, locking PR..."
          # `gh pr lock` has no --comment flag (it only accepts --reason),
          # so post the explanation as a comment first, then lock.
          gh pr comment ${{ github.event.pull_request.number }} --body "PR check failed. This PR is now locked."
          gh pr lock ${{ github.event.pull_request.number }}
32 changes: 11 additions & 21 deletions jobs/parse_readme/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,20 @@ def start():
channel = Config.writer() or 'default'
NotionWriter.clean_output()

main_page = NotionReader.read_main_page()
# Official API: read_main_page returns dict, not block with children.
# We use read_all_pages to get all subpages recursively (or just use _read_post_pages logic)
# But read_all_pages filters by config.
# Here we want specific pages.
# Let's use read_all_pages but we need to ensure config doesn't filter them out?
# Config is set in __main__.
# But here we want specific titles.
# Let's use NotionReader._read_post_pages() which reads all from main page, then we filter.
# Wait, _read_post_pages() applies config filter!
# If we want ALL pages to filter manually, we might need to temporarily set config or use internal method.
# Actually, _read_post_pages calls _recurse_read_page.
# Let's just use _recurse_read_page manually or set Config.page_titles to ['all'] temporarily?
# But Config is global.
# Let's just use NotionReader.read_all_pages() and assume Config is set to 'all' (default) or we set it.
# In __main__, Config.parse_configs() is called. Default is 'all'.
# So read_all_pages() should return all pages.

all_pages = NotionReader.read_all_pages()
source_pages = Utils.find(all_pages, lambda it: NotionReader._get_page_title(it) in [
all_pages = []
target_titles = [
"NotionDown README",
"NotionDown GetTokenV2",
"NotionDown Custom Config",
])
]

source_pages = []
for title in target_titles:
page = NotionReader.read_page_with_title(title)
if page:
source_pages.append(page)
else:
print(f"Warning: Page not found for title: {title}")

for source_page in source_pages:
md_page = NotionReader._parse_page(source_page)
Expand Down
5 changes: 3 additions & 2 deletions jobs/parse_sample_posts_for_hexo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def start():
Config.set_download_image(True)
Config.set_writer('Hexo')
# For debug
# Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")
# Config.set_page_titles_match(["^(Hexo page -)"])
if not Config.blog_url():
Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")
Config.set_page_titles_match(["^(Hexo page -)"])

print("")
print("Run with configs:")
Expand Down
137 changes: 121 additions & 16 deletions notion_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,49 +43,58 @@ def handle_post() -> List[NotionPage]:
@staticmethod
def handle_page_with_title(page_title: str) -> typing.Optional[NotionPage]:
print("#handle_page_with_title: " + page_title)
pages = NotionReader._read_post_pages()
# Note: Official API pages have 'properties' -> 'title' -> 'title' -> 'plain_text'
# But here 'pages' are likely Block objects or Page objects.
# We need to ensure we are filtering correctly.
# For now, assuming _read_post_pages returns a list of Page objects (dicts).

find_one = Utils.find_one(pages, lambda it: NotionReader._get_page_title(it) == page_title)
if not find_one:
page = NotionReader.read_page_with_title(page_title)
if not page:
return None
return NotionReader._parse_page(find_one)
return NotionReader._parse_page(page)

@staticmethod
def handle_page(page) -> NotionPage:
    """Parse one raw Notion page object into a NotionPage."""
    print("#handle_single_page")
    parsed_page = NotionReader._parse_page(page)
    return parsed_page

@staticmethod
def read_main_page() -> Dict[str, Any]:
def read_main_page() -> typing.Optional[Dict[str, Any]]:
print("#read_main_page")
if not Config.blog_url():
return None
try:
page_id = NotionUtils.extract_id(Config.blog_url())
return NotionReader.get_client().pages.retrieve(page_id=page_id)
except Exception as e:
# If retrieval fails (e.g. invalid permissions), re-raise or logging?
# For now re-raise to be visible
raise e

@staticmethod
def read_all_pages() -> typing.List[Dict[str, Any]]:
    """Return every page in the configured scope, after config-based filtering."""
    print("#read_all_pages")
    pages = NotionReader._read_post_pages()
    return pages

@staticmethod
def search_pages(query: str) -> typing.List[Dict[str, Any]]:
    """
    Search workspace pages whose title matches *query*.

    Follows Search API pagination (has_more / next_cursor), matching the
    behaviour of _get_all_pages_in_workspace; the original implementation
    returned only the first response page, silently truncating results at
    the API's default page size.

    :param query: search string passed to the Notion Search API.
    :return: list of raw page dicts (possibly empty).
    """
    print("#search_pages: " + query)
    results: typing.List[Dict[str, Any]] = []
    start_cursor = None
    while True:
        response = NotionReader.get_client().search(
            query=query,
            filter={"value": "page", "property": "object"},
            start_cursor=start_cursor,
            page_size=100,
        )
        results.extend(response.get('results', []))
        if not response.get('has_more'):
            break
        start_cursor = response.get('next_cursor')
    return results

@staticmethod
def read_page_with_title(page_title: str) -> typing.Optional[Dict[str, Any]]:
print("#read_page_with_title")
return Utils.find_one(NotionReader.read_all_pages(), lambda it: NotionReader._get_page_title(it) == page_title)

# Get pages within scope (Global or Blog-scoped)
scoped_pages = NotionReader._get_scoped_pages()

# Find the page in the scoped list
return Utils.find_one(scoped_pages, lambda it: NotionReader._get_page_title(it) == page_title)

@staticmethod
def _read_post_pages() -> typing.List[Dict[str, Any]]:
# get all pages
main_page = NotionReader.read_main_page()
page_blocks = []

# In official API, we need to list children of the main page to find sub-pages
NotionReader._recurse_read_page(page_blocks, main_page)
# Get valid pages (scoped)
page_blocks = NotionReader._get_scoped_pages()

# filter by config
titles = Config.page_titles()
Expand All @@ -100,6 +109,102 @@ def _read_post_pages() -> typing.List[Dict[str, Any]]:
)]
filter_by_titles.extend([it for it in filter_by_titles_match if it not in filter_by_titles])
return filter_by_titles

@staticmethod
def _get_scoped_pages() -> typing.List[Dict[str, Any]]:
    """
    Collect pages according to the configured scope.

    When blog_url is configured, only descendants of that root page are
    returned; otherwise every page in the workspace is in scope.
    """
    workspace_pages = NotionReader._get_all_pages_in_workspace()

    blog_url = Config.blog_url()
    if not blog_url:
        print("Scope: global workspace")
        return workspace_pages

    print("Scope: filtering pages under blog_url...")
    root_id = NotionUtils.extract_id(blog_url)
    return NotionReader._filter_descendants(workspace_pages, root_id)

@staticmethod
def _get_all_pages_in_workspace() -> typing.List[Dict[str, Any]]:
    """Page through the Search API and collect every page in the workspace."""
    collected: typing.List[Dict[str, Any]] = []
    cursor = None

    print("Searching all pages in workspace...")
    while True:
        # Pages only; databases and blocks are excluded by the filter.
        response = NotionReader.get_client().search(
            filter={"value": "page", "property": "object"},
            start_cursor=cursor,
            page_size=100
        )
        batch = response.get('results', [])
        collected.extend(batch)
        cursor = response.get('next_cursor')
        print(f"Fetched {len(batch)} pages, total so far: {len(collected)}")
        if not response.get('has_more'):
            break

    return collected

@staticmethod
def _filter_descendants(all_pages: typing.List[Dict[str, Any]], root_id: str) -> typing.List[Dict[str, Any]]:
"""
Filters the list of all pages to return only those that are descendants of the root_id.
Reconstructs the tree structure in memory.
"""
# Normalize root_id
if not root_id:
return []
root_id = root_id.replace('-', '')

# Build children map: parent_id -> list of child pages
children_map = {}
page_map = {}

for page in all_pages:
pid = page.get('id').replace('-', '')
page_map[pid] = page

parent = page.get('parent')
if parent and parent.get('type') == 'page_id':
parent_id = parent.get('page_id').replace('-', '')
if parent_id not in children_map:
children_map[parent_id] = []
children_map[parent_id].append(page)

# BFS to find all descendants
descendants = []
queue = [root_id]
visited = set()

while queue:
current_id = queue.pop(0)
if current_id in visited:
continue
visited.add(current_id)

# If current_id is in page_map (and it's not the root iterator start), add it
# (We want to include root if it is in all_pages? The caller handles root usually,
# but let's gather everything reachable.
# If root_id is the main page content, we usually want it.
if current_id in page_map:
descendants.append(page_map[current_id])

# Add children to queue
if current_id in children_map:
for child in children_map[current_id]:
child_id = child.get('id').replace('-', '')
queue.append(child_id)

return descendants

@staticmethod
def _recurse_read_page(page_blocks: typing.List[Dict[str, Any]], parent_page: Dict[str, Any]):
Expand Down
23 changes: 23 additions & 0 deletions tests/notion-sdk-py-official-apis/notion_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,26 @@ def test_read_notion_page_with_hexo(self):
test_pages = NotionReader.read_all_pages()
self.assertTrue(len(test_pages) > 0)

def test_filter_descendants(self):
    """_filter_descendants keeps the root's subtree and drops unrelated pages."""
    def make_page(pid, parent_type, parent_id=None):
        parent = {'type': parent_type}
        if parent_id is not None:
            parent['page_id'] = parent_id
        return {'id': pid, 'parent': parent}

    all_pages = [
        make_page('root', 'workspace'),
        make_page('c1', 'page_id', 'root'),
        make_page('c2', 'page_id', 'root'),
        make_page('gc1', 'page_id', 'c1'),
        make_page('other', 'workspace'),
        make_page('oc1', 'page_id', 'other'),
    ]

    descendants = NotionReader._filter_descendants(all_pages, 'root')
    ids = [p['id'] for p in descendants]

    # The whole root subtree is present...
    for expected in ('root', 'c1', 'c2', 'gc1'):
        self.assertIn(expected, ids)
    # ...and the unrelated tree is excluded.
    self.assertNotIn('other', ids)
    self.assertNotIn('oc1', ids)
    self.assertEqual(len(ids), 4)

Loading