Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions .github/workflows/check-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# CI workflow: runs the NotionDown parse jobs against the live Notion API
# on every PR targeting master, or on manual dispatch.
name: Check PR

on:
  pull_request:
    branches:
      - master
  workflow_dispatch:

# pull-requests: write is required by the "Lock PR on failure" step.
permissions:
  contents: read
  pull-requests: write

jobs:
  check-jobs:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.8
        uses: actions/setup-python@v5
        with:
          python-version: '3.8'

      - name: Install Python dependencies
        run: |
          pip install notion-client
          pip install pangu
          pip install python-slugify
          pip install requests

      # The three parse jobs run sequentially; the first failure skips
      # the remaining ones and triggers the lock step below.
      - name: Run parse_readme
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
        run: PYTHONPATH=./ python jobs/parse_readme/main.py

      - name: Run parse_sample_posts
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
        run: PYTHONPATH=./ python jobs/parse_sample_posts/main.py

      - name: Run parse_sample_posts_for_hexo
        env:
          NOTION_TOKEN: ${{ secrets.NOTION_TOKEN }}
        run: PYTHONPATH=./ python jobs/parse_sample_posts_for_hexo/main.py

      - name: Lock PR on failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GH_TOKEN }}
        run: |
          echo "Job failed, locking PR..."
          # `gh pr lock` has no --comment flag (it only accepts --reason),
          # so post the explanation as a comment first, then lock.
          gh pr comment ${{ github.event.pull_request.number }} --body "PR check failed. This PR is now locked."
          gh pr lock ${{ github.event.pull_request.number }}
32 changes: 11 additions & 21 deletions jobs/parse_readme/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,20 @@ def start():
channel = Config.writer() or 'default'
NotionWriter.clean_output()

main_page = NotionReader.read_main_page()
# Official API: read_main_page returns dict, not block with children.
# We use read_all_pages to get all subpages recursively (or just use _read_post_pages logic)
# But read_all_pages filters by config.
# Here we want specific pages.
# Let's use read_all_pages but we need to ensure config doesn't filter them out?
# Config is set in __main__.
# But here we want specific titles.
# Let's use NotionReader._read_post_pages() which reads all from main page, then we filter.
# Wait, _read_post_pages() applies config filter!
# If we want ALL pages to filter manually, we might need to temporarily set config or use internal method.
# Actually, _read_post_pages calls _recurse_read_page.
# Let's just use _recurse_read_page manually or set Config.page_titles to ['all'] temporarily?
# But Config is global.
# Let's just use NotionReader.read_all_pages() and assume Config is set to 'all' (default) or we set it.
# In __main__, Config.parse_configs() is called. Default is 'all'.
# So read_all_pages() should return all pages.

all_pages = NotionReader.read_all_pages()
source_pages = Utils.find(all_pages, lambda it: NotionReader._get_page_title(it) in [
all_pages = []
target_titles = [
"NotionDown README",
"NotionDown GetTokenV2",
"NotionDown Custom Config",
])
]

source_pages = []
for title in target_titles:
page = NotionReader.read_page_with_title(title)
if page:
source_pages.append(page)
else:
print(f"Warning: Page not found for title: {title}")

for source_page in source_pages:
md_page = NotionReader._parse_page(source_page)
Expand Down
5 changes: 3 additions & 2 deletions jobs/parse_sample_posts_for_hexo/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def start():
Config.set_download_image(True)
Config.set_writer('Hexo')
# For debug
# Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")
# Config.set_page_titles_match(["^(Hexo page -)"])
if not Config.blog_url():
Config.set_blog_url("https://www.notion.so/kaedea/Noton-Down-Sample-440de7dca89840b6b3bab13d2aa92a34")
Config.set_page_titles_match(["^(Hexo page -)"])

print("")
print("Run with configs:")
Expand Down
137 changes: 121 additions & 16 deletions notion_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,49 +43,58 @@ def handle_post() -> List[NotionPage]:
@staticmethod
def handle_page_with_title(page_title: str) -> typing.Optional[NotionPage]:
print("#handle_page_with_title: " + page_title)
pages = NotionReader._read_post_pages()
# Note: Official API pages have 'properties' -> 'title' -> 'title' -> 'plain_text'
# But here 'pages' are likely Block objects or Page objects.
# We need to ensure we are filtering correctly.
# For now, assuming _read_post_pages returns a list of Page objects (dicts).

find_one = Utils.find_one(pages, lambda it: NotionReader._get_page_title(it) == page_title)
if not find_one:
page = NotionReader.read_page_with_title(page_title)
if not page:
return None
return NotionReader._parse_page(find_one)
return NotionReader._parse_page(page)

@staticmethod
def handle_page(page) -> NotionPage:
    """Parse one raw Notion page object into a NotionPage."""
    print("#handle_single_page")
    parsed_page = NotionReader._parse_page(page)
    return parsed_page

@staticmethod
def read_main_page() -> Dict[str, Any]:
def read_main_page() -> typing.Optional[Dict[str, Any]]:
print("#read_main_page")
if not Config.blog_url():
return None
try:
page_id = NotionUtils.extract_id(Config.blog_url())
return NotionReader.get_client().pages.retrieve(page_id=page_id)
except Exception as e:
# If retrieval fails (e.g. invalid permissions), re-raise or logging?
# For now re-raise to be visible
raise e

@staticmethod
def read_all_pages() -> typing.List[Dict[str, Any]]:
    """Return every page in the configured scope, after config-based filtering."""
    print("#read_all_pages")
    pages = NotionReader._read_post_pages()
    return pages

@staticmethod
def search_pages(query: str) -> typing.List[Dict[str, Any]]:
    """
    Search workspace pages whose title matches *query*.

    Follows Search API pagination (has_more / next_cursor), matching the
    behaviour of _get_all_pages_in_workspace; the original implementation
    returned only the first response page, silently truncating results at
    the API's default page size.

    :param query: search string passed to the Notion Search API.
    :return: list of raw page dicts (possibly empty).
    """
    print("#search_pages: " + query)
    results: typing.List[Dict[str, Any]] = []
    start_cursor = None
    while True:
        response = NotionReader.get_client().search(
            query=query,
            filter={"value": "page", "property": "object"},
            start_cursor=start_cursor,
            page_size=100,
        )
        results.extend(response.get('results', []))
        if not response.get('has_more'):
            break
        start_cursor = response.get('next_cursor')
    return results

@staticmethod
def read_page_with_title(page_title: str) -> typing.Optional[Dict[str, Any]]:
print("#read_page_with_title")
return Utils.find_one(NotionReader.read_all_pages(), lambda it: NotionReader._get_page_title(it) == page_title)

# Get pages within scope (Global or Blog-scoped)
scoped_pages = NotionReader._get_scoped_pages()

# Find the page in the scoped list
return Utils.find_one(scoped_pages, lambda it: NotionReader._get_page_title(it) == page_title)

@staticmethod
def _read_post_pages() -> typing.List[Dict[str, Any]]:
# get all pages
main_page = NotionReader.read_main_page()
page_blocks = []

# In official API, we need to list children of the main page to find sub-pages
NotionReader._recurse_read_page(page_blocks, main_page)
# Get valid pages (scoped)
page_blocks = NotionReader._get_scoped_pages()

# filter by config
titles = Config.page_titles()
Expand All @@ -100,6 +109,102 @@ def _read_post_pages() -> typing.List[Dict[str, Any]]:
)]
filter_by_titles.extend([it for it in filter_by_titles_match if it not in filter_by_titles])
return filter_by_titles

@staticmethod
def _get_scoped_pages() -> typing.List[Dict[str, Any]]:
    """
    Collect pages according to the configured scope.

    When blog_url is configured, only descendants of that root page are
    returned; otherwise every page in the workspace is in scope.
    """
    workspace_pages = NotionReader._get_all_pages_in_workspace()

    blog_url = Config.blog_url()
    if not blog_url:
        print("Scope: global workspace")
        return workspace_pages

    print("Scope: filtering pages under blog_url...")
    root_id = NotionUtils.extract_id(blog_url)
    return NotionReader._filter_descendants(workspace_pages, root_id)

@staticmethod
def _get_all_pages_in_workspace() -> typing.List[Dict[str, Any]]:
    """Page through the Search API and collect every page in the workspace."""
    collected: typing.List[Dict[str, Any]] = []
    cursor = None

    print("Searching all pages in workspace...")
    while True:
        # Pages only; databases and blocks are excluded by the filter.
        response = NotionReader.get_client().search(
            filter={"value": "page", "property": "object"},
            start_cursor=cursor,
            page_size=100
        )
        batch = response.get('results', [])
        collected.extend(batch)
        cursor = response.get('next_cursor')
        print(f"Fetched {len(batch)} pages, total so far: {len(collected)}")
        if not response.get('has_more'):
            break

    return collected

@staticmethod
def _filter_descendants(all_pages: typing.List[Dict[str, Any]], root_id: str) -> typing.List[Dict[str, Any]]:
"""
Filters the list of all pages to return only those that are descendants of the root_id.
Reconstructs the tree structure in memory.
"""
# Normalize root_id
if not root_id:
return []
root_id = root_id.replace('-', '')

# Build children map: parent_id -> list of child pages
children_map = {}
page_map = {}

for page in all_pages:
pid = page.get('id').replace('-', '')
page_map[pid] = page

parent = page.get('parent')
if parent and parent.get('type') == 'page_id':
parent_id = parent.get('page_id').replace('-', '')
if parent_id not in children_map:
children_map[parent_id] = []
children_map[parent_id].append(page)

# BFS to find all descendants
descendants = []
queue = [root_id]
visited = set()

while queue:
current_id = queue.pop(0)
if current_id in visited:
continue
visited.add(current_id)

# If current_id is in page_map (and it's not the root iterator start), add it
# (We want to include root if it is in all_pages? The caller handles root usually,
# but let's gather everything reachable.
# If root_id is the main page content, we usually want it.
if current_id in page_map:
descendants.append(page_map[current_id])

# Add children to queue
if current_id in children_map:
for child in children_map[current_id]:
child_id = child.get('id').replace('-', '')
queue.append(child_id)

return descendants

@staticmethod
def _recurse_read_page(page_blocks: typing.List[Dict[str, Any]], parent_page: Dict[str, Any]):
Expand Down
23 changes: 23 additions & 0 deletions tests/notion-sdk-py-official-apis/notion_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,26 @@ def test_read_notion_page_with_hexo(self):
test_pages = NotionReader.read_all_pages()
self.assertTrue(len(test_pages) > 0)

def test_filter_descendants(self):
    """_filter_descendants keeps the root's subtree and drops unrelated pages."""
    def make_page(pid, parent_type, parent_id=None):
        parent = {'type': parent_type}
        if parent_id is not None:
            parent['page_id'] = parent_id
        return {'id': pid, 'parent': parent}

    all_pages = [
        make_page('root', 'workspace'),
        make_page('c1', 'page_id', 'root'),
        make_page('c2', 'page_id', 'root'),
        make_page('gc1', 'page_id', 'c1'),
        make_page('other', 'workspace'),
        make_page('oc1', 'page_id', 'other'),
    ]

    descendants = NotionReader._filter_descendants(all_pages, 'root')
    ids = [p['id'] for p in descendants]

    # The whole root subtree is present...
    for expected in ('root', 'c1', 'c2', 'gc1'):
        self.assertIn(expected, ids)
    # ...and the unrelated tree is excluded.
    self.assertNotIn('other', ids)
    self.assertNotIn('oc1', ids)
    self.assertEqual(len(ids), 4)

Loading