Skip to content

Commit 01de802

Browse files
committed
fix sitemap building
1 parent 2040165 commit 01de802

File tree

1 file changed

+189
-13
lines changed

1 file changed

+189
-13
lines changed

build.py

Lines changed: 189 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -258,10 +258,12 @@ def get_last_mod_date() -> dict[str, str]:
258258
)
259259
except subprocess.CalledProcessError:
260260
print("Not a git repository. Using current date as last modification date.")
261-
return {page.stem: now for page in template_dir.glob("*.html")}
261+
return {page.stem: now for page in content_dir.glob("*.yaml")}
262262

263263
last_mod_dates = {}
264-
for page in template_dir.glob("*.html"):
264+
265+
# Content YAML files
266+
for page in content_dir.glob("*.yaml"):
265267
page_name = page.stem
266268
try:
267269
result = subprocess.run(
@@ -278,32 +280,206 @@ def get_last_mod_date() -> dict[str, str]:
278280
.replace(" +0100", " +01:00")
279281
.replace(" +0200", " +02:00")
280282
)
281-
formatted_date = date_obj.strftime("%Y-%m-%d")
282-
last_mod_dates[page_name] = formatted_date
283+
last_mod_dates[page_name] = date_obj.strftime("%Y-%m-%d")
283284
except subprocess.CalledProcessError as e:
284285
print(f"Error getting git log for {page}: {e}")
285286
last_mod_dates[page_name] = now
287+
288+
# Posts (markdown files)
289+
posts_dir = content_dir / "posts"
290+
if posts_dir.exists():
291+
latest_post_date = None
292+
for post in posts_dir.glob("*.md"):
293+
post_key = f"post:{post.stem}" # e.g., "post:my-first-post"
294+
try:
295+
result = subprocess.run(
296+
["git", "log", "-n", "1", "--format=%ci", "--", post],
297+
check=True,
298+
capture_output=True,
299+
text=True,
300+
)
301+
date_str = result.stdout.strip()
302+
if not date_str:
303+
continue
304+
date_obj = datetime.fromisoformat(
305+
date_str.replace("Z", "+00:00")
306+
.replace(" +0100", " +01:00")
307+
.replace(" +0200", " +02:00")
308+
)
309+
formatted_date = date_obj.strftime("%Y-%m-%d")
310+
last_mod_dates[post_key] = formatted_date
311+
312+
# Track latest post date for posts index
313+
if latest_post_date is None or date_obj > latest_post_date:
314+
latest_post_date = date_obj
315+
except subprocess.CalledProcessError as e:
316+
print(f"Error getting git log for {post}: {e}")
317+
last_mod_dates[post_key] = now
318+
319+
# Posts index uses the date of the most recently modified post
320+
if latest_post_date:
321+
last_mod_dates["posts"] = latest_post_date.strftime("%Y-%m-%d")
322+
286323
return last_mod_dates
287324

288325

326+
# def get_manual_mod_dates() -> dict[str, str]:
327+
# """Get last modification dates for lab-manual pages via GitHub API."""
328+
# now = datetime.now().strftime("%Y-%m-%d")
329+
# mod_dates = {}
330+
331+
# repo = config["manual_repo"]
332+
# api_base = f"https://api.github.com/repos/{repo}/commits"
333+
334+
# req = requests.get(
335+
# f"https://api.github.com/repos/{repo}/contents/source",
336+
# headers={"Accept": "application/vnd.github.v3+json"},
337+
# )
338+
# if req.status_code != 200:
339+
# return {}
340+
341+
# for file_info in req.json():
342+
# if not file_info["name"].endswith(".md"):
343+
# continue
344+
# page_name = file_info["name"].replace(".md", "")
345+
346+
# # Get last commit for this file
347+
# commits_req = requests.get(
348+
# api_base,
349+
# params={"path": f"source/{file_info['name']}", "per_page": 1},
350+
# headers={"Accept": "application/vnd.github.v3+json"},
351+
# )
352+
# if commits_req.status_code == 200 and commits_req.json():
353+
# commit_date = commits_req.json()[0]["commit"]["committer"]["date"]
354+
# date_obj = datetime.fromisoformat(commit_date.replace("Z", "+00:00"))
355+
# mod_dates[f"lab-manual:{page_name}"] = date_obj.strftime("%Y-%m-%d")
356+
# else:
357+
# mod_dates[f"lab-manual:{page_name}"] = now
358+
359+
# return mod_dates
360+
361+
362+
def get_manual_mod_dates() -> dict[str, str]:
    """Return last-modification dates for the lab-manual pages.

    Shallow-clones ``config["manual_repo"]`` from GitHub into a temporary
    directory, reads the list of pages from the repository's ``Makefile`` and
    asks ``git log`` for the most recent commit touching each page.

    Returns:
        A mapping of ``"lab-manual:<slug>"`` to a ``YYYY-MM-DD`` string for
        every page listed in the Makefile (``README`` is stored under the
        slug ``index``), plus a ``"lab-manual"`` entry holding the newest of
        those dates when at least one date could be read from git. Pages
        whose date cannot be determined fall back to today's date. If the
        repository cannot be cloned, an empty mapping is returned so the
        caller can apply its own fallback.
    """
    import tempfile

    now = datetime.now().strftime("%Y-%m-%d")
    mod_dates = {}
    repo = config["manual_repo"]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Shallow clone with enough history to get meaningful dates.
        # NOTE: a page last changed before the oldest fetched commit is
        # reported with that boundary commit's date, not its true date.
        try:
            subprocess.run(
                ["git", "clone", "--depth", "50", f"https://github.com/{repo}.git", tmpdir],
                check=True,
                capture_output=True,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            # Last-modification dates are not essential to the build: report
            # the failure and let the caller use its default date.
            print(f"Error cloning {repo}: {e.stderr.strip()}")
            return mod_dates

        # Get page order from Makefile (same logic as build_manual)
        makefile = Path(tmpdir) / "Makefile"
        page_order = [
            p.split(".md")[0]
            for p in makefile.read_text().split("\n")
            if p.startswith("source/") or p.endswith(".md \\")
        ]

        latest_manual_date = None

        for page in page_order:
            if page == "README":
                page_slug = "index"
                file_path = "README.md"
            else:
                page_slug = page.replace("source/", "").lower()
                file_path = f"{page}.md"
            page_key = f"lab-manual:{page_slug}"

            try:
                result = subprocess.run(
                    [
                        "git",
                        "-C",
                        tmpdir,
                        "log",
                        "-n",
                        "1",
                        # %cI is the committer date in strict ISO 8601, which
                        # datetime.fromisoformat parses for any UTC offset.
                        "--format=%cI",
                        "--",
                        file_path,
                    ],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                date_str = result.stdout.strip()
                if not date_str:
                    # No commit touches this path in the fetched history.
                    mod_dates[page_key] = now
                    continue

                # Some git versions print UTC as "Z", which fromisoformat
                # only accepts from Python 3.11 on.
                date_obj = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
            except (subprocess.CalledProcessError, ValueError) as e:
                print(f"Error getting git log for {file_path}: {e}")
                mod_dates[page_key] = now
                continue

            mod_dates[page_key] = date_obj.strftime("%Y-%m-%d")

            # Track latest for manual index
            if latest_manual_date is None or date_obj > latest_manual_date:
                latest_manual_date = date_obj

        # Manual index uses the most recently modified page
        if latest_manual_date:
            mod_dates["lab-manual"] = latest_manual_date.strftime("%Y-%m-%d")

    return mod_dates
439+
440+
289441
def make_sitemap():
290442
now = datetime.now().strftime("%Y-%m-%d")
443+
mod_dates = get_last_mod_date() | get_manual_mod_dates()
291444

292-
mod_dates = get_last_mod_date()
445+
# Build reverse mapping: build file path -> page key
446+
file_to_page = {config["pages"][page]["file"]: page for page in config["pages"]}
293447

294448
sitemap = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
295449

296450
for page in build_dir.glob("**/*.html"):
297-
page_name = page.parent.name if page.parent != build_dir else page.stem
298-
url = config["deploy_url"] + str(page.relative_to(build_dir)).replace(
299-
"index.html", ""
300-
)
451+
relative_path = str(page.relative_to(build_dir))
452+
page_key = file_to_page.get(relative_path)
453+
454+
# Handle posts
455+
if (
456+
page_key is None
457+
and relative_path.startswith("p/")
458+
and relative_path.count("/") == 2
459+
):
460+
post_slug = relative_path.split("/")[1]
461+
page_key = f"post:{post_slug}"
462+
# Handle manual pages
463+
elif page_key is None and relative_path.startswith("lab-manual/"):
464+
manual_page_slug = relative_path.replace("manual/", "").replace(
465+
"/index.html", ""
466+
)
467+
if manual_page_slug == "index":
468+
manual_page_slug = "index"
469+
else:
470+
manual_page_slug = manual_page_slug.replace("lab-", "")
471+
page_key = f"lab-manual:{manual_page_slug}"
472+
473+
url = config["deploy_url"] + relative_path.replace("index.html", "")
301474
url_element = ET.SubElement(sitemap, "url")
302475
ET.SubElement(url_element, "loc").text = url
303-
try:
304-
ET.SubElement(url_element, "lastmod").text = mod_dates[page_name]
305-
except KeyError:
306-
# new page
476+
477+
if page_key and page_key in mod_dates:
478+
ET.SubElement(url_element, "lastmod").text = mod_dates[page_key]
479+
else:
480+
print(
481+
f"No last modification date found for {relative_path}. Using current date."
482+
)
307483
ET.SubElement(url_element, "lastmod").text = now
308484

309485
tree = ET.ElementTree(sitemap)

0 commit comments

Comments
 (0)