
Commit 9cdf202

feat: new tokenization & embeddings post and pinned posts

1 parent 70f3d45 commit 9cdf202

6 files changed: +432 -3 lines changed

aigc/app.js

Lines changed: 22 additions & 3 deletions
@@ -31,7 +31,8 @@ async function loadPosts() {
         // Try to fetch a list of posts - we'll use a fallback approach
         const postFiles = [
             'hallucination-mitigation',
-            'intro-to-aigc'
+            'intro-to-aigc',
+            'tokenization-embeddings'
         ];
 
         const posts = [];
@@ -51,6 +52,7 @@ const postFiles = [
                 date: metadata.date || 'Unknown Date',
                 excerpt: metadata.excerpt || 'No description available',
                 category: metadata.category || 'general',
+                pinned: metadata.pinned === 'true' || metadata.pinned === true,
                 filename: filename,
                 link: `./posts/${filename}/`
             });
@@ -60,6 +62,19 @@ const postFiles = [
             }
         }
 
+        // Sort posts: pinned first (by date), then unpinned (by date)
+        posts.sort((a, b) => {
+            // If one is pinned and one isn't, pinned comes first
+            if (a.pinned !== b.pinned) {
+                return a.pinned ? -1 : 1;
+            }
+
+            // Both pinned or both unpinned: sort by date (newest first)
+            const dateA = new Date(a.date);
+            const dateB = new Date(b.date);
+            return dateB - dateA;
+        });
+
         return posts;
     } catch (err) {
         console.error('Error loading posts:', err);
@@ -83,10 +98,14 @@ function formatDate(dateStr) {
 }
 
 function createPostCard(post) {
+    const pinnedBadge = post.pinned ? '<span class="text-xs text-[#ff00ff] bg-[#ff00ff]/10 px-3 py-1 rounded-full font-bold ml-2">📌 PINNED</span>' : '';
     return `
-        <article class="bg-gray-900/80 p-6 rounded-lg border-l-4 border-[#00ff88] hover:border-[#00ccff] shadow-lg shadow-blue-500/10 transition-all duration-300 transform hover:-translate-y-1">
+        <article class="bg-gray-900/80 p-6 rounded-lg border-l-4 ${post.pinned ? 'border-[#ff00ff]' : 'border-[#00ff88]'} hover:border-[#00ccff] shadow-lg shadow-blue-500/10 transition-all duration-300 transform hover:-translate-y-1">
             <div class="flex justify-between items-start mb-3">
-                <span class="text-xs text-[#00ccff] bg-[#00ccff]/10 px-3 py-1 rounded-full font-bold">${post.category}</span>
+                <div class="flex gap-2">
+                    <span class="text-xs text-[#00ccff] bg-[#00ccff]/10 px-3 py-1 rounded-full font-bold">${post.category}</span>
+                    ${pinnedBadge}
+                </div>
                 <time class="text-sm text-gray-500 font-mono">${formatDate(post.date)}</time>
             </div>
             <h2 class="text-xl font-semibold mb-3 text-[#f1c40f] hover:text-[#00ff88] transition">

aigc/posts/intro-to-aigc.md

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ title: "Introduction to AI Generated Content (AIGC) Section"
 date: "2026-01-15"
 category: "general"
 excerpt: "To Log Random Musings with AI"
+pinned: true
 ---
 
 ## Why I Started This
aigc/posts/tokenization-embeddings.md

Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
---
title: "Tokenization & Embeddings"
date: "2026-01-18"
category: "tokenization-embeddings"
excerpt: "From Words to Vectors: Tokenization and Embeddings"
---

## From Words to Vectors: Tokenization and Embeddings

If LLMs are the "engine" of modern AI, then **Tokenization** and **Embeddings** are the fuel. Before a model can reason, summarize, or code, it must first translate human language into a language it understands: **high-dimensional mathematics.**

Understanding this bridge is crucial for anyone building AI agents, optimizing RAG pipelines, or managing API costs.

## 1. Tokenization: Breaking Language into Bricks

Tokenization is the process of chopping a string of text into smaller units called **tokens**. Think of tokens as the "atomic units" of processing.

### **The Three Levels of Tokenization**

1. **Word-level:** Splitting by spaces. (Simple, but fails on "running" vs "runner").
2. **Character-level:** Splitting every letter. (Too granular; the model loses context).
3. **Subword-level (The Standard):** Models like GPT-4 use **Byte Pair Encoding (BPE)**. It keeps common words as a single token (e.g., "apple") but splits rare words into pieces (e.g., "hallucination" becomes "hallucin" + "ation"); see the sketch after this list.
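
To make the subword idea concrete, here is a minimal, illustrative sketch: a greedy longest-match tokenizer over a made-up vocabulary. This is not GPT-4's actual BPE (which learns its merge table from a corpus), just the flavor of it.

```js
// Toy greedy longest-match subword tokenizer. The vocabulary is invented
// for illustration; a real BPE tokenizer learns its merges from data.
const vocab = new Set([
  'apple', 'run', 'ning', 'hallucin', 'ation',
  'a', 'c', 'e', 'g', 'h', 'i', 'l', 'n', 'o', 'p', 'r', 't', 'u',
]);

function tokenize(word) {
  const tokens = [];
  let i = 0;
  while (i < word.length) {
    // Take the longest vocabulary entry that matches at position i
    let j = word.length;
    while (j > i && !vocab.has(word.slice(i, j))) j--;
    if (j === i) j = i + 1; // unknown character: emit it as a single token
    tokens.push(word.slice(i, j));
    i = j;
  }
  return tokens;
}

console.log(tokenize('apple'));         // ['apple'] (common word: one token)
console.log(tokenize('hallucination')); // ['hallucin', 'ation'] (rare word: two pieces)
console.log(tokenize('running'));       // ['run', 'ning']
```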

### **Why It Matters**

* **The 75% Rule:** In English, 1,000 tokens are roughly equivalent to 750 words.
* **Context Windows:** Models have a "memory limit" (e.g., 128k tokens). If your tokenizer is inefficient, you hit that limit faster.
* **Cost:** You are billed by the token. Understanding how your text tokenizes helps you estimate spend and optimize prompts; a quick estimator follows this list.
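
As a back-of-envelope application of the 75% rule, here is a rough estimator. The price constant is a made-up placeholder, not any provider's actual rate, and real tokenizers vary by language and content (code tokenizes differently from prose), so treat the output strictly as an estimate.

```js
// Rough token/cost estimate from the ~75% rule (1,000 tokens ≈ 750 words).
// PRICE_PER_1K_TOKENS is a placeholder; check your provider's pricing page.
const PRICE_PER_1K_TOKENS = 0.002;

function estimateCost(text) {
  const words = text.trim().split(/\s+/).length;
  const tokens = Math.ceil(words / 0.75); // ~1 token per 0.75 words
  return { words, tokens, usd: (tokens / 1000) * PRICE_PER_1K_TOKENS };
}

console.log(estimateCost('How do I fix a leaky faucet?'));
// { words: 7, tokens: 10, usd: 0.00002 }
```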

## 2. Embeddings: Giving Words a "Map"

Once we have tokens, the model assigns each one a unique ID. But a list of IDs (e.g., `45, 102, 33`) doesn't tell the model that "dog" is related to "puppy."

This is where **Embeddings** come in. An embedding is a numerical representation of a token in a high-dimensional vector space.

### **The Semantic Space**

Imagine a 3D map where words with similar meanings are physically close to each other.

* "Apple" and "Banana" are close together.
* "Apple" and "Laptop" are slightly further apart (unless discussing tech).
* "Apple" and "Justice" are very far apart.

In reality, modern embeddings don't use 3 dimensions; they use **thousands** (e.g., 1,536 dimensions for OpenAI's `text-embedding-3-small`). Each dimension represents a "feature" of the word that the model learned during training.

### **The Magic of Vector Math**

Because these are numbers, we can perform math on them. The classic example:

`Vector("King") − Vector("Man") + Vector("Woman") ≈ Vector("Queen")`
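
You can check this intuition with a few lines of JavaScript. The 3-D vectors below are hand-picked toys (real embeddings have thousands of learned dimensions), but the arithmetic is exactly the same.

```js
// Hand-picked toy vectors; read the axes as [royalty, masculine, feminine].
const vec = {
  king:  [0.9, 0.8, 0.1],
  man:   [0.1, 0.9, 0.1],
  woman: [0.1, 0.1, 0.9],
  queen: [0.9, 0.1, 0.9],
};

const add = (a, b) => a.map((x, i) => x + b[i]);
const sub = (a, b) => a.map((x, i) => x - b[i]);
const dot = (a, b) => a.reduce((s, x, i) => s + x * b[i], 0);
const cosine = (a, b) => dot(a, b) / (Math.sqrt(dot(a, a)) * Math.sqrt(dot(b, b)));

// King - Man + Woman lands very close to Queen
const result = add(sub(vec.king, vec.man), vec.woman);
console.log(cosine(result, vec.queen).toFixed(3)); // ≈ 0.997
```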

## 3. How They Work Together: The Pipeline

Here is the journey of a user query through an AI system (a toy trace of the first three steps follows the list):

1. **Input:** "How do I fix a leaky faucet?"
2. **Tokenization:** The string is split into tokens: `["How", " do", " I", " fix", " a", " leaky", " fauc", "et", "?"]`.
3. **Lookup:** The model looks up the **Embedding** for each token.
4. **Attention Layer:** The model looks at the vectors and realizes "leaky" is modifying "faucet," creating a combined understanding of the query.
5. **Output:** The model generates the next most likely token vector and turns it back into a word.
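
The first three steps are easy to demystify in code. Everything below (the ids, the 4-dimensional vectors) is invented for illustration; steps 4 and 5 happen inside the model's own layers.

```js
// Toy trace of steps 1-3: text -> tokens -> ids -> vectors.
const tokens = ['How', ' do', ' I', ' fix', ' a', ' leaky', ' fauc', 'et', '?'];

// Each distinct token gets an id (real vocabularies hold ~100k entries)
const vocabIds = new Map(tokens.map((t, i) => [t, i]));
const ids = tokens.map((t) => vocabIds.get(t));

// The embedding table maps each id to a learned vector (4-d here;
// real models use thousands of dimensions)
const embeddingTable = ids.map(() => Array.from({ length: 4 }, Math.random));
const vectors = ids.map((id) => embeddingTable[id]);

console.log(ids);               // [0, 1, 2, 3, 4, 5, 6, 7, 8]
console.log(vectors[0].length); // 4
```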

## 4. Practical Implementation: When to Care

If you are an AI architect, you will encounter these concepts in two main areas:

### **A. Choosing an Embedding Model**

Not all embeddings are equal. You need to balance **Performance vs. Latency**.

* **Proprietary (OpenAI/Gemini):** Extremely high performance, but you pay per request and data leaves your server.
* **Open Source (Qwen3/Gemma):** Can be hosted locally (good for privacy), but requires your own GPU infrastructure.

### **B. Vector Databases (The RAG Connection)**

When you build a Knowledge Base, you are essentially storing thousands of embeddings.

* **The Process:** You "embed" your entire document library.
* **The Search:** When a user asks a question, you embed the *question* and find the document vectors closest to it in the vector space, typically scored with **cosine similarity** (one of many possible similarity measures); a retrieval sketch follows this list.
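
Here is a minimal sketch of that search, assuming the documents were embedded ahead of time. `embed()` is a placeholder for a real embedding API call, not a function defined here; brute force like this is fine for small libraries, and dedicated vector databases add indexing (e.g., HNSW) to scale it up.

```js
const dot = (a, b) => a.reduce((s, x, i) => s + x * b[i], 0);
const cosine = (a, b) => dot(a, b) / (Math.sqrt(dot(a, a)) * Math.sqrt(dot(b, b)));

// Brute-force nearest-neighbor search over pre-embedded documents
function topK(queryVec, docs, k = 3) {
  return docs
    .map((doc) => ({ ...doc, score: cosine(queryVec, doc.vector) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, k);
}

// Usage sketch (embed() stands in for your embedding provider):
// const docs = [{ text: '...', vector: await embed('...') }, /* ... */];
// const hits = topK(await embed('How do I fix a leaky faucet?'), docs);
```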

## Conclusion: The Math of Meaning

Tokenization and embeddings are why AI feels "human." By turning language into a spatial map, we allow machines to understand nuances, synonyms, and relationships that traditional keyword search could never touch.

If you are building an agent, remember: **Better embeddings lead to better retrieval, and better tokenization leads to better efficiency.**
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
// Parse YAML frontmatter from markdown
function parseFrontmatter(content) {
    const frontmatterRegex = /^---\n([\s\S]*?)\n---/;
    const match = content.match(frontmatterRegex);

    if (!match) {
        return { metadata: {}, content: content };
    }

    const frontmatterStr = match[1];
    const metadata = {};

    // Simple YAML parser
    frontmatterStr.split('\n').forEach(line => {
        const [key, ...valueParts] = line.split(':');
        if (key && valueParts.length > 0) {
            let value = valueParts.join(':').trim();
            value = value.replace(/^["']|["']$/g, '');
            metadata[key.trim()] = value;
        }
    });

    const bodyContent = content.replace(frontmatterRegex, '').trim();
    return { metadata, content: bodyContent };
}

// Get the post filename from URL
function getPostFilename() {
    const path = window.location.pathname;
    const parts = path.split('/');
    // Should be something like /aigc/posts/post-name/
    for (let i = 0; i < parts.length; i++) {
        if (parts[i] === 'posts' && i + 1 < parts.length) {
            return parts[i + 1];
        }
    }
    return null;
}

// Format date
function formatDate(dateStr) {
    try {
        const [year, month, day] = dateStr.split('-');
        const date = new Date(year, month - 1, day);
        // Guard against unparseable input (e.g. 'Unknown'): an invalid Date
        // would otherwise render as the literal string "Invalid Date"
        if (isNaN(date)) {
            return dateStr;
        }
        return date.toLocaleDateString('en-US', { day: 'numeric', month: 'short', year: 'numeric' });
    } catch (e) {
        return dateStr;
    }
}

// Load and render the post
async function loadPost() {
    const filename = getPostFilename();
    if (!filename) {
        document.getElementById('post-container').innerHTML = '<p>Post not found</p>';
        return;
    }

    console.log('Loading post:', filename);

    try {
        // Fetch the markdown file from the posts directory (one level up from current post dir)
        const response = await fetch(`/aigc/posts/${filename}.md`);

        if (!response.ok) {
            console.error('Failed to fetch markdown:', response.status, response.statusText);
            document.getElementById('post-container').innerHTML = `<p>Could not load post (${response.status})</p>`;
            return;
        }

        const markdownContent = await response.text();
        console.log('Markdown loaded, length:', markdownContent.length);

        const { metadata, content } = parseFrontmatter(markdownContent);
        console.log('Metadata:', metadata);

        // Update page title
        if (metadata.title) {
            document.title = `ritchie@singapore~$ ${metadata.title}`;
        }

        // Convert markdown to HTML - ensure marked is available
        let htmlContent;
        console.log('marked available:', typeof marked !== 'undefined');

        if (typeof marked !== 'undefined' && marked.parse) {
            try {
                htmlContent = marked.parse(content);
                console.log('Markdown parsed successfully, HTML length:', htmlContent.length);
            } catch (e) {
                console.error('Error parsing markdown:', e);
                htmlContent = `<pre>${content}</pre>`;
            }
        } else {
            console.warn('marked.js not loaded, showing raw markdown');
            htmlContent = `<pre>${content}</pre>`;
        }

        // Build the post header
        const headerHTML = `
            <div class="post-header">
                <h1 class="post-title">${metadata.title || 'Untitled'}</h1>
                <div class="post-meta">
                    <div class="post-meta-item">
                        <span class="post-meta-label">Published:</span> ${formatDate(metadata.date || 'Unknown')}
                    </div>
                    <div class="post-meta-item">
                        <span class="post-meta-label">Category:</span> <span style="color: #00ff88;">${metadata.category || 'AIGC'}</span>
                    </div>
                </div>
            </div>
        `;

        // Insert the header and content
        const container = document.getElementById('post-container');
        container.innerHTML = headerHTML + htmlContent;

        // Re-render MathJax if it's loaded
        if (typeof MathJax !== 'undefined' && MathJax.typesetPromise) {
            MathJax.typesetPromise([container]).catch(err => console.log('MathJax error:', err));
        }

    } catch (error) {
        console.error('Error loading post:', error);
        document.getElementById('post-container').innerHTML = `<p>Error loading post: ${error.message}</p>`;
    }
}

document.addEventListener('DOMContentLoaded', loadPost);
