-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild.py
More file actions
499 lines (429 loc) · 17.4 KB
/
build.py
File metadata and controls
499 lines (429 loc) · 17.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
import subprocess
import os
import markdown
from jinja2 import Template, Environment, FileSystemLoader
from markupsafe import Markup, escape
from datetime import datetime
import json
from pathlib import Path
import shutil
from updates import UpdateReader
from datetime import datetime
import frontmatter
# Jinja2 environment: all templates are loaded from the src/ directory.
env = Environment(loader=FileSystemLoader('src'))
# Site-wide metadata (title, link, phrases, default image, ...) as a dictionary.
# It is used in the index.html template and throughout the build.
# Loaded once from src/meta_data.json.
meta_data_dir = "src/meta_data.json"
with open(meta_data_dir) as f:
    meta_data = json.load(f)
# Templates are compiled once at import time and reused for every page.
# template: layout for an individual post or page
template = env.get_template("template.html")
# index template is used to render the index page
index_template = env.get_template("index.html")
# category template is used to render a page for each category
category_template = env.get_template("category_template.html")
# blog index template is used to render the blog index page (for all categories)
blog_index_template = env.get_template("blog_template.html")
# readings-note template renders the static docs/readings_note.html page
readings_note_template = env.get_template("readings_note_template.html")
# Markdown extensions applied to every conversion.
# NOTE(review): the name misspells "Extensions", but it is referenced
# throughout this file, so it must stay as-is.
Markdown_Extenstions = ['pymdownx.tilde', 'pymdownx.emoji', 'tables', 'meta', 'footnotes', 'md_in_html', 'extra']
# Parsed POST objects, filled in by generate_html().
posts = []
pages = []
# CATEGORY objects, filled in by BLOG_INDEX.render().
categories = []
# Directories to collect static files from.
# The key is the source directory and the value is the target directory.
# The target directory and its existing content are not deleted.
collect_dirs = {
    'source/image': 'docs/image',
    'source/static': 'docs/page',
}
# Stylesheet compilation is delegated to the npm toolchain.
def build_css():
    """Compile the site CSS by running the `build:css` npm script (Tailwind).

    Raises:
        subprocess.CalledProcessError: if the npm script exits non-zero.
    """
    command = ["npm", "run", "build:css"]
    subprocess.run(command, check=True)
def get_updates():
    '''
    Load the site's update feed from the configured Google Spreadsheet.

    Returns:
        UpdateReader: a reader with the spreadsheet contents loaded.
    '''
    # NOTE(review): "spreedsheet" is a typo, but it is the actual key
    # stored in src/meta_data.json, so it must be preserved here.
    reader = UpdateReader(spreadsheet_id=meta_data["update_spreedsheet_id"])
    reader.load_from_spreadsheet()
    return reader
class POST:
    '''
    Represent a blog post or a standalone page.

    Posts and pages share the same structure; they only live in different
    source directories and appear in different areas of index.html.
    Typical lifecycle: POST(md_path, html_path) -> parse() -> render().
    '''
    def __init__(self, md_path, html_path):
        self.md_path = md_path      # source markdown file path
        self.html_path = html_path  # output HTML file path (under docs/)
        self.title = None
        self.content = None         # rendered HTML body, filled by parse()
        self.date = None
        self.author = None
        self.category = None
        self.tags = []
        self.read_time = None
        self.link = None            # site-relative URL, filled by parse()
        self.summary = None
        self.full_link = None       # absolute URL (site link + self.link)
        # FIX: initialise last_modified here so render() cannot hit an
        # AttributeError when parse() bails out on missing metadata
        # (previously it was only assigned inside parse()'s try block).
        self.last_modified = None
        # Cover image for the post, used on category and blog index pages.
        # Defaults to the site-wide image from meta_data.json; front
        # matter may override it in parse().
        self.image = meta_data["image"]
        self.post_meta_data = None

    def parse(self) -> None:
        '''
        Parse the markdown file and extract metadata and content.

        Fills content (HTML), post_meta_data (front-matter dict), link,
        full_link and the individual metadata attributes.
        '''
        # FIX: read with an explicit encoding so the build does not depend
        # on the platform's default locale encoding.
        with open(self.md_path, 'r', encoding='utf-8') as md_file:
            # Deliberately NOT escaping with markupsafe: escaping would
            # mangle blockquotes inside the markdown. Revisit if posts
            # ever contain untrusted HTML.
            md_content = md_file.read()
        md = markdown.Markdown(extensions=Markdown_Extenstions)
        html_content = md.convert(md_content)
        self.content = html_content
        # The YAML front matter is the authoritative metadata source
        # (rather than the markdown 'meta' extension's md.Meta).
        self.post_meta_data = frontmatter.loads(md_content).metadata
        # Site-relative link: strip the leading "docs" from the output path.
        self.link = self.html_path[4:]
        # Absolute link including the site's domain name.
        self.full_link = meta_data["link"] + self.link
        try:
            self.title = self.post_meta_data["Title"]
            self.author = self.post_meta_data["Authors"]
            self.summary = self.post_meta_data["Summary"]
            self.category = self.post_meta_data["Category"]
            self.date = self.post_meta_data['Date']
            # 'Last_modified' is optional; it defaults to the post date.
            self.last_modified = self.post_meta_data.get('Last_modified', self.date)
            print(f"Processing {self.md_path} with last modified date {self.last_modified}")
            # Front matter yields 'Tags' as a YAML list already.
            self.tags = self.post_meta_data.get('Tags', [])
            # Optional per-post cover image; falls back to the default.
            self.image = self.post_meta_data.get("Image", self.image)
        except KeyError:
            # A required key is missing: report it and keep whatever
            # metadata was parsed before the failure.
            print(f"Metadata not found in {self.md_path}")

    def render(self) -> None:
        '''
        Render the post through the page template and write the result
        to self.html_path.
        '''
        rendered_html = template.render(
            meta_data=meta_data,
            post_meta_data=self.post_meta_data,
            title=self.title,
            author=self.author,
            summary=self.summary,
            category=self.category,
            date=self.date,
            last_modified=self.last_modified,
            content=self.content,
            phrases=meta_data["phrases"],
            image=self.image,
            tags=self.tags,
            link=self.link,
        )
        # FIX: explicit encoding to match parse().
        with open(self.html_path, "w", encoding='utf-8') as html_file:
            html_file.write(rendered_html)
class CATEGORY:
    '''
    One per-category blog listing page.

    Holds the posts belonging to a single category plus the combined
    list of their tags, and renders docs/blog_<category>.html.
    '''
    def __init__(self, meta_data, posts, category) -> None:
        self.meta_data = meta_data
        self.posts = posts
        self.category = category
        # All tags across the category's posts (duplicates preserved,
        # matching how the template consumes them).
        all_tags = []
        for post in posts:
            all_tags.extend(post.tags)
        self.tags = all_tags
        self.link = f"blog_{category}.html"
        self.count = len(posts)
        # Posts arrive sorted newest-first, so index 0 is the latest.
        self.last_post = posts[0]

    def render(self) -> None:
        '''
        Render this category's listing page into the docs/ directory.
        '''
        page_html = category_template.render(
            title=self.meta_data["title"],
            phrases=self.meta_data["phrases"],
            posts=self.posts,
            category=self.category,
            tags=self.tags
        )
        output_path = f"docs/blog_{self.category}.html"
        with open(output_path, "w") as html_file:
            html_file.write(page_html)
class BLOG_INDEX:
    '''
    Build the blog index page and one listing page per category.
    '''
    def __init__(self, meta_data, posts) -> None:
        '''
        Args:
            meta_data: site-wide metadata dict.
            posts: list of POST objects (expected newest-first).
        '''
        self.meta_data = meta_data
        self.posts = posts
        # Unique categories represented among the posts.
        self.category_list = {post.category for post in posts}
        # Unique tags across all posts; a post's tags may be None.
        self.tags = {
            tag
            for post in posts
            for tag in (post.tags if post.tags is not None else [])
        }

    def render(self) -> None:
        '''
        Render every category page, then docs/blog_index.html.

        Each rendered CATEGORY is appended to the module-level
        `categories` list, which the blog index template iterates over.
        '''
        for category_name in self.category_list:
            # Posts belonging to the current category.
            category_posts = [post for post in self.posts if post.category == category_name]
            # FIX: removed the unused `category_tags` computation and
            # stopped shadowing the loop variable with the CATEGORY object.
            category_page = CATEGORY(self.meta_data, category_posts, category_name)
            category_page.render()
            categories.append(category_page)
        # Render the blog index page covering all categories.
        rendered_html = blog_index_template.render(
            title=self.meta_data["title"],
            phrases=self.meta_data["phrases"],
            categories=categories,
        )
        with open("docs/blog_index.html", "w") as html_file:
            html_file.write(rendered_html)
class INDEX:
    '''
    Build and render the site landing page (docs/index.html).
    '''
    def __init__(self, meta_data, posts, pages) -> None:
        '''
        Args:
            meta_data: site-wide metadata dict (mutated by parse()).
            posts: list of POST objects shown in the blog section.
            pages: list of POST objects for standalone pages.
        '''
        self.meta_data = meta_data
        self.posts = posts
        self.pages = pages
        # Legal pages are linked elsewhere, not listed on the index page.
        self.pages = [page for page in self.pages if page.title not in ["Terms of Service", "Privacy Policy"]]
        # Filled in by parse().
        self.description = None
        self.content = None

    def parse(self) -> None:
        '''
        Parse source/index.md and merge its metadata into meta_data.
        '''
        with open("source/index.md", 'r') as md_file:
            md_content = md_file.read()
        md = markdown.Markdown(extensions=Markdown_Extenstions)
        html_content = md.convert(md_content)
        # The 'meta' extension exposes front matter as a dict of lists
        # of strings, hence the [0] indexing below.
        self.post_meta_data = md.Meta
        self.content = html_content
        try:
            self.meta_data["title"] = self.post_meta_data.get("title", ["Untitled"])[0]
            self.meta_data["author"] = self.post_meta_data.get("authors", ["Anonymous"])[0]
            self.meta_data["date"] = datetime.strptime(
                self.post_meta_data.get("date", [datetime.now().strftime('%Y-%m-%d')])[0],
                '%Y-%m-%d'
            )
            # FIX: read the description from the page's own metadata like
            # the keys above. The old code read self.meta_data (the site
            # dict) and indexed it with [0], which truncated a plain
            # string description to its first character. The site-level
            # description, if any, is kept as the fallback.
            fallback_description = self.meta_data.get("description", "No description available")
            self.meta_data["description"] = self.post_meta_data.get(
                "description", [fallback_description]
            )[0]
        except KeyError:
            print("Metadata not found in index.md")

    def render(self) -> None:
        '''
        Render the index page, including the recent-updates feed pulled
        from the Google Spreadsheet.
        '''
        updates = get_updates().get_recent_updates()
        rendered_html = index_template.render(
            title=self.meta_data["title"],
            author=self.meta_data["author"],
            date=self.meta_data["date"],
            description=self.meta_data["description"],
            phrases=self.meta_data["phrases"],
            content=self.content,
            posts=self.posts,
            pages=self.pages,
            updates=updates
        )
        with open("docs/index.html", "w") as html_file:
            html_file.write(rendered_html)
# JSON-LD generation for search-engine structured data.
def generate_posts_jsonld(posts, meta_data) -> dict:
    """
    Build Schema.org JSON-LD structured data describing the blog posts.

    Args:
        posts: List of POST objects.
        meta_data: Site metadata dict (requires 'title' and 'link').

    Returns:
        dict: An ItemList of BlogPosting entries, ready for json.dump.
    """
    site_title = meta_data["title"]
    publisher = {
        "@type": "Organization",
        "name": site_title,
        "url": meta_data["link"],
    }

    def _posting(post):
        # One BlogPosting entry; dateModified falls back to the publish date.
        published = post.date.strftime('%Y-%m-%d')
        modified = post.last_modified.strftime('%Y-%m-%d') if post.last_modified else published
        return {
            "@type": "BlogPosting",
            "@id": post.full_link,
            "headline": post.title,
            "author": {
                "@type": "Person",
                "name": post.author,
            },
            "datePublished": published,
            "dateModified": modified,
            "description": post.summary,
            "about": {
                "@type": "DefinedTerm",
                "name": post.category,
                "inDefinedTermSet": {
                    "@type": "DefinedTermSet",
                    "name": "Blog Categories",
                },
            },
            "keywords": post.tags,
            "url": post.full_link,
            "publisher": publisher,
            "mainEntityOfPage": {
                "@type": "WebPage",
                "@id": post.full_link,
            },
        }

    return {
        "@context": "https://schema.org",
        "@type": "ItemList",
        "name": f"{site_title} - Blog Posts",
        "description": f"Collection of blog posts from {site_title}",
        "numberOfItems": len(posts),
        "itemListElement": [_posting(post) for post in posts],
    }
def generate_html() -> None:
    '''
    Generate HTML files from Markdown files.

    Walks source/page and source/post (index.md is handled separately by
    INDEX below), rendering each markdown file into the mirrored path
    under docs/. Then writes the JSON-LD posts metadata, the blog index
    pages, and the site index page.
    '''
    directories = ["source/page", "source/post"]
    source_root = Path("source")
    docs_root = Path("docs")
    for directory in directories:
        dir_path = Path(directory)
        # print the work being done
        print(f"Processing Folder: {dir_path}")
        for md_path in dir_path.rglob('*.md'):
            # Calculate the relative path from the source root
            relative_md_path = md_path.relative_to(source_root)
            # Construct the corresponding HTML path in the docs directory
            html_path = (docs_root / relative_md_path).with_suffix('.html')
            # Ensure the parent directory exists
            html_path.parent.mkdir(parents=True, exist_ok=True)
            # Create and process the POST object
            post = POST(str(md_path), str(html_path))
            post.parse()
            # posts/pages are the module-level lists consumed by
            # BLOG_INDEX and INDEX further down.
            if directory == "source/post":
                posts.append(post)
            else:
                pages.append(post)
            post.render()
    # Sort so the latest post comes first; post.date from front matter may
    # be a plain date, so normalise to a datetime before taking the
    # timestamp. Ties break alphabetically on title.
    # posts.sort(key=lambda post: (-post.date.timestamp(), post.title))
    posts.sort(key=lambda post: (
        -datetime.combine(post.date, datetime.min.time()).timestamp(),
        post.title
    ))
    # Output JSON-LD file with linked data schema for posts
    json_ld_data = generate_posts_jsonld(posts, meta_data)
    with open("docs/posts_metadata.jsonld", "w") as json_file:
        json.dump(json_ld_data, json_file, indent=2, ensure_ascii=False)
    print(f"Posts metadata written to docs/posts_metadata.jsonld ({len(posts)} posts)")
    # Blog index page (also renders the per-category pages)
    blog_index = BLOG_INDEX(meta_data, posts)
    blog_index.render()
    # Site landing page (parses source/index.md itself)
    index = INDEX(meta_data, posts, pages)
    index.parse()
    index.render()
def clean_old_files() -> None:
    '''
    Remove stale build artifacts from docs/: all rendered HTML files,
    styles.css, and the copied image directory.
    '''
    print("Cleaning old files")
    docs_dir = Path("docs")
    # Remove all .html files in docs directory and subdirectories.
    for html_file in docs_dir.rglob('*.html'):
        html_file.unlink()
    print("Old files cleaned")
    # Remove styles.css if it exists (regenerated by build_css()).
    styles_css = docs_dir / "styles.css"
    if styles_css.exists():
        styles_css.unlink()
        print("styles.css removed")
    # Remove the copied image directory if it exists.
    images_dir = docs_dir / "image"
    if images_dir.exists():
        # FIX: the previous per-file unlink() loop crashed on nested
        # sub-directories (Path.unlink cannot remove a directory, and
        # rmdir() fails on a non-empty tree); shutil.rmtree removes the
        # whole tree regardless of nesting.
        shutil.rmtree(images_dir)
        print("images directory removed")
def collect_static_files(static_dirs: dict = None) -> None:
    """
    Copy static assets into the output tree without wiping the target.

    Args:
        static_dirs (dict): Mapping of source directory -> target directory.
            Defaults to {'source/image': 'docs/image'}.
    """
    if static_dirs is None:
        static_dirs = {'source/image': 'docs/image'}
    for src, dst in static_dirs.items():
        source_dir = Path(src)
        target_dir = Path(dst)
        print(f"Copying files from {source_dir} to {target_dir}")
        # Missing source directories are reported and skipped, not fatal.
        if not source_dir.exists():
            print(f"Source directory {source_dir} does not exist")
            continue
        # Ensure the target directory exists
        target_dir.mkdir(parents=True, exist_ok=True)
        try:
            for entry in source_dir.iterdir():
                destination = target_dir / entry.name
                if entry.is_dir():
                    # Recursively copy subdirectories, merging into any
                    # existing target content.
                    shutil.copytree(entry, destination, dirs_exist_ok=True)
                else:
                    # Overwrite individual files if they already exist.
                    shutil.copy2(entry, destination)
            print("Files copied successfully")
        except Exception as e:
            print(f"Error copying files: {e}")
            raise
def render_reading_notes() -> None:
    '''
    Render the static readings-note page to docs/readings_note.html.
    '''
    print("Building notes")
    page_html = readings_note_template.render(
        meta_data=meta_data,
        phrases=meta_data["phrases"]
    )
    with open("docs/readings_note.html", "w") as outfile:
        outfile.write(page_html)
    print("Notes built successfully")
if __name__ == "__main__":
    # Full site build pipeline: remove stale output, regenerate all HTML,
    # render the readings-note page, copy static assets, then compile the
    # Tailwind stylesheet last.
    clean_old_files()
    generate_html()
    render_reading_notes()
    collect_static_files(collect_dirs)
    build_css()