1+ import sqlite3
2+ import re
3+ import yaml
4+ from pathlib import Path
5+
# SQLite database that holds the ``man_pages`` table. The location is derived
# from this file's own path (three ``.parent`` hops up, then db/man_pages/),
# so it does not depend on the current working directory.
DB_PATH = Path(__file__).parent.parent.parent.joinpath(
    "db", "man_pages", "man-pages-db.db"
)

# Root of the categorized Markdown tree that ``main`` walks. This is an
# absolute, machine-specific path.
CATEGORIZED_DIR = Path("/home/lince/ubuntu-sitemaps/categorized")
8+
def normalize_subcategory(subcat):
    """Turn a sub-category label into a lowercase, dash-separated slug.

    Every run of characters other than ``a-z`` and ``0-9`` (after
    lowercasing) becomes a single ``-``, and leading/trailing dashes are
    removed. For example ``"File Management"`` becomes ``"file-management"``.

    Args:
        subcat: The label to normalize. Non-string values (for instance a
            YAML scalar that was parsed as an int) are converted with
            ``str()`` first instead of failing on ``.lower()``.

    Returns:
        The slug, which is ``""`` when the label has no ASCII letters or
        digits.
    """
    text = str(subcat).lower()
    # The ``+`` already folds each run of separators into one dash, so no
    # separate "collapse repeated dashes" pass is needed afterwards.
    slug = re.sub(r'[^a-z0-9]+', '-', text)
    return slug.strip('-')
14+
def extract_frontmatter(md_file):
    """Read a Markdown file and return its YAML front matter as a dict.

    The front matter is the text between a ``---`` at the very start of the
    file and the next ``---`` that follows it.

    Args:
        md_file: Path of the Markdown file; it is opened as UTF-8 text.

    Returns:
        The parsed front matter. An empty dict is returned when the file does
        not start with ``---``, when there is no closing ``---``, when the
        YAML cannot be parsed, or when the parsed document is not a mapping
        (for example a bare string or a list), so callers can always use
        ``.get`` on the result.
    """
    with open(md_file, "r", encoding="utf-8") as f:
        content = f.read()

    if not content.startswith('---'):
        return {}

    # maxsplit=2 yields: text before the first marker (empty), the front
    # matter itself, and the remainder of the document.
    parts = content.split('---', 2)
    if len(parts) < 3:
        return {}

    try:
        metadata = yaml.safe_load(parts[1].strip())
    except (yaml.YAMLError, ValueError):
        # Best effort: a malformed header is treated as "no metadata".
        # ValueError is included because building typed scalars (such as an
        # out-of-range date) can fail outside the YAMLError hierarchy.
        return {}

    # safe_load gives None for an empty header and may give a str/list/etc.
    # for headers that are not key/value mappings.
    return metadata if isinstance(metadata, dict) else {}
28+
def _iter_markdown_files(root):
    """Yield ``(main_category, md_file)`` for each ``*.md`` file under *root*.

    The walk covers ``root/<category>/<tool>/*.md``. At both directory levels,
    entries that are not directories or whose name starts with ``.`` are
    skipped. A progress line is printed once per accepted category.
    """
    for main_cat_dir in root.iterdir():
        if not main_cat_dir.is_dir() or main_cat_dir.name.startswith('.'):
            continue
        # Announce only entries that are actually processed as categories.
        print(f"Processing category: { main_cat_dir .name } ")
        for tool_dir in main_cat_dir.iterdir():
            if not tool_dir.is_dir() or tool_dir.name.startswith('.'):
                continue
            for md_file in tool_dir.glob("*.md"):
                yield main_cat_dir.name, md_file


def main():
    """Sync ``man_pages.sub_category`` with the Markdown front matter.

    For every Markdown file found under ``CATEGORIZED_DIR`` whose front matter
    has a non-empty ``sub_category``, the value is normalized and written to
    the rows of the ``man_pages`` table in ``DB_PATH`` that match the file's
    category directory name and file name. Files with no matching row, and
    rows whose stored value already equals the normalized one, are reported
    and skipped.

    All changes are committed once at the end. If an unexpected error stops
    the run, nothing is committed and the connection is still closed.
    """
    conn = sqlite3.connect(DB_PATH)
    updated = 0
    try:
        cur = conn.cursor()
        for main_category, md_file in _iter_markdown_files(CATEGORIZED_DIR):
            metadata = extract_frontmatter(md_file)
            subcat = metadata.get('sub_category')
            if not subcat:
                continue
            norm_subcat = normalize_subcategory(subcat)
            filename = md_file.name

            # Look at the stored value first so unchanged rows are not rewritten.
            cur.execute(
                "SELECT sub_category FROM man_pages WHERE main_category = ? AND filename = ?",
                (main_category, filename)
            )
            row = cur.fetchone()
            if row is None:
                # The UPDATE below uses the same WHERE clause, so it could not
                # match anything either; say so instead of claiming an update.
                print(f"Skipping (no matching row): main_category={main_category}, filename={filename}")
                continue
            if row[0] == norm_subcat:
                print(f"Skipping (already up-to-date): main_category={ main_category } , filename={ filename } ")
                continue

            print(f"Updating: main_category={ main_category } , filename={ filename } , new_sub_category={ norm_subcat } ")
            try:
                cur.execute(
                    "UPDATE man_pages SET sub_category = ? WHERE main_category = ? AND filename = ?",
                    (norm_subcat, main_category, filename)
                )
            except sqlite3.IntegrityError as e:
                # A constraint rejected the new value; report it and move on.
                print(f"IntegrityError for file { filename } (main_category={ main_category } ): { e } ")
                continue
            if cur.rowcount > 0:
                # Count affected rows (not statements) so the summary is accurate.
                updated += cur.rowcount

        conn.commit()
        print(f"Updated { updated } rows.")
    finally:
        conn.close()
73+
# Run the sync only when this file is executed as a script, so that importing
# the module (e.g. to reuse its helpers) does not touch the database.
if __name__ == "__main__":
    main()
0 commit comments