feat: image tokenization

ritchieng · ritchieng · commit 953a43833105 · 2026-01-22T02:19:17.000Z
diff --git a/aigc/app.js b/aigc/app.js
@@ -31,6 +31,7 @@ async function loadPosts() {
         // Try to fetch a list of posts - we'll use a fallback approach
 const postFiles = [
             'hallucination-mitigation',
+            'image-tokenization',
             'intro-to-aigc',
             'tokenization-embeddings'
         ];
diff --git a/aigc/posts/image-tokenization.md b/aigc/posts/image-tokenization.md
@@ -0,0 +1,65 @@
+---
+title: "Image Tokenization"
+date: "2026-01-22"
+category: "AIGC"
+pinned: false
+excerpt: "How CLIP Bridges Pixels and Prose"
+---
+
+## How CLIP Bridges Pixels and Prose
+This post focuses specifically on **CLIP** (Contrastive Language-Image Pre-training), the breakthrough architecture from OpenAI that taught AI to "see" by reading and "read" by looking.
+
+Unlike standard LLMs that only understand text, CLIP creates a bridge between vision and language. Here's how it tokenizes images to match the language of text.
+
+For years, Computer Vision and Natural Language Processing (NLP) were two different worlds. Vision models looked for edges and textures; text models looked for grammar and syntax.
+
+CLIP changed everything. It proved that if you "tokenize" an image correctly, you can force it into the same mathematical space as text. This is the foundation of tools like DALL-E, Midjourney, and advanced semantic image search.
+
+## The "Two Towers" Architecture
+CLIP operates using two parallel encoders—often called the "Two Towers":
+
+- **The Text Tower**: A standard Transformer (like GPT) that tokenizes text using Byte Pair Encoding (BPE).
+- **The Image Tower**: A Vision Transformer (ViT) that "tokenizes" an image into a sequence of patches.
+
+The goal isn't just to understand them separately, but to project both into a shared **Multimodal Embedding Space**.
+
+## Tokenizing the Image: The Patching Strategy
+To make an image "look" like a sentence, CLIP doesn't look at individual pixels. Instead, it treats the image like a jigsaw puzzle:
+
+- **Patching**: The image (e.g., $224 \times 224$ pixels) is chopped into a grid of squares, usually $16 \times 16$ pixels each.
+- **Linear Projection**: Each patch is flattened into a vector. If a patch has $16 \times 16$ pixels with 3 color channels (RGB), it starts as 768 numbers.
+- **The "Class" Token**: Just as a text model adds a special token to represent the "entire sentence," CLIP adds a `[CLS]` token to the image sequence. This token eventually learns to represent the summary of the entire image.
+- **Position Embeddings**: Because the model needs to know that the "cloud" patch is above the "grass" patch, a unique "coordinate" vector is added to each patch.
+
+## The "Aha!" Moment: Contrastive Learning
+How do we make the vector for the word "Golden Retriever" look like the vector for a patch-grid of a dog? CLIP uses a training method called **Contrastive Learning**.
+
+Imagine a batch of $N$ images and $N$ captions:
+
+- **The Goal**: Maximize the cosine similarity between the correct pairs (Image A + Caption A).
+- **The Constraint**: Minimize the similarity between the incorrect pairs (Image A + Caption B).
+
+Mathematically, CLIP calculates the dot product of the image and text vectors. The model is essentially playing a massive game of "Match the Caption to the Photo."
+
+## Why This Matters: Zero-Shot Intelligence
+Traditional vision models were rigid. If you trained a model on "cats" and "dogs," it would fail if it saw a "panda."
+
+Because CLIP uses textual descriptions as its labels, it has **Zero-Shot capabilities**. You can give it an image of a "Cyberpunk-style neon city" (something it was never specifically trained to categorize) and because it understands the tokens for "neon," "city," and "cyberpunk," it can find that image in the vector space with incredible accuracy.
+
+## Practical Implementation: The Latent Space
+When you build an application using CLIP, you aren't storing images; you are storing **Image Embeddings** (typically 512 or 768 dimensions).
+
+- **Image Search**: You embed your entire photo library. When a user types "sunset at the beach," you embed that text and find the image vectors with the highest cosine similarity.
+- **Content Moderation**: You can check if an image embedding is mathematically close to the embedding of "prohibited content" tokens.
+
+## Summary: The Unified Language of Vectors
+| Feature | Text Tokenization | Image "Tokenization" (ViT) |
+|---------|-------------------|---------------------------|
+| Basic Unit | Sub-word (BPE) | $16 \times 16$ Pixel Patch |
+| Sequence | List of word IDs | Grid of patch vectors |
+| Final Output | Text Embedding (d-dim) | Visual Embedding (d-dim) |
+| The Bridge | Contrastive Loss forces them to match in the Latent Space. | |
+
+## What's Next?
+By bridging the gap between sight and language, CLIP allows machines to navigate the world more like we do—through concepts rather than just raw data.
+
diff --git a/aigc/posts/image-tokenization/app.js b/aigc/posts/image-tokenization/app.js
@@ -0,0 +1,129 @@
+// Parse YAML frontmatter from markdown
+function parseFrontmatter(content) {
+    const frontmatterRegex = /^---\n([\s\S]*?)\n---/;
+    const match = content.match(frontmatterRegex);
+    
+    if (!match) {
+        return { metadata: {}, content: content };
+    }
+    
+    const frontmatterStr = match[1];
+    const metadata = {};
+    
+    // Simple YAML parser
+    frontmatterStr.split('\n').forEach(line => {
+        const [key, ...valueParts] = line.split(':');
+        if (key && valueParts.length > 0) {
+            let value = valueParts.join(':').trim();
+            value = value.replace(/^["']|["']$/g, '');
+            metadata[key.trim()] = value;
+        }
+    });
+    
+    const bodyContent = content.replace(frontmatterRegex, '').trim();
+    return { metadata, content: bodyContent };
+}
+
+// Get the post filename from URL
+function getPostFilename() {
+    const path = window.location.pathname;
+    const parts = path.split('/');
+    // Should be something like /aigc/posts/post-name/
+    for (let i = 0; i < parts.length; i++) {
+        if (parts[i] === 'posts' && i + 1 < parts.length) {
+            return parts[i + 1];
+        }
+    }
+    return null;
+}
+
+// Format date
+function formatDate(dateStr) {
+    try {
+        const [year, month, day] = dateStr.split('-');
+        const date = new Date(year, month - 1, day);
+        return date.toLocaleDateString('en-US', { day: 'numeric', month: 'short', year: 'numeric' });
+    } catch (e) {
+        return dateStr;
+    }
+}
+
+// Load and render the post
+async function loadPost() {
+    const filename = getPostFilename();
+    if (!filename) {
+        document.getElementById('post-container').innerHTML = '<p>Post not found</p>';
+        return;
+    }
+
+    console.log('Loading post:', filename);
+
+    try {
+        // Fetch the markdown file from the posts directory (one level up from current post dir)
+        const response = await fetch(`/aigc/posts/${filename}.md`);
+        
+        if (!response.ok) {
+            console.error('Failed to fetch markdown:', response.status, response.statusText);
+            document.getElementById('post-container').innerHTML = `<p>Could not load post (${response.status})</p>`;
+            return;
+        }
+
+        const markdownContent = await response.text();
+        console.log('Markdown loaded, length:', markdownContent.length);
+        
+        const { metadata, content } = parseFrontmatter(markdownContent);
+        console.log('Metadata:', metadata);
+
+        // Update page title
+        if (metadata.title) {
+            document.title = `ritchie@singapore~$ ${metadata.title}`;
+        }
+
+        // Convert markdown to HTML - ensure marked is available
+        let htmlContent;
+        console.log('marked available:', typeof marked !== 'undefined');
+        
+        if (typeof marked !== 'undefined' && marked.parse) {
+            try {
+                htmlContent = marked.parse(content);
+                console.log('Markdown parsed successfully, HTML length:', htmlContent.length);
+            } catch (e) {
+                console.error('Error parsing markdown:', e);
+                htmlContent = `<pre>${content}</pre>`;
+            }
+        } else {
+            console.warn('marked.js not loaded, showing raw markdown');
+            htmlContent = `<pre>${content}</pre>`;
+        }
+
+        // Build the post header
+        const headerHTML = `
+            <div class="post-header">
+                <h1 class="post-title">${metadata.title || 'Untitled'}</h1>
+                <div class="post-meta">
+                    <div class="post-meta-item">
+                        <span class="post-meta-label">Published:</span> ${formatDate(metadata.date || 'Unknown')}
+                    </div>
+                    <div class="post-meta-item">
+                        <span class="post-meta-label">Category:</span> <span style="color: #00ff88;">${metadata.category || 'AIGC'}</span>
+                    </div>
+                </div>
+            </div>
+        `;
+
+        // Insert the header and content
+        const container = document.getElementById('post-container');
+        container.innerHTML = headerHTML + htmlContent;
+
+        // Re-render MathJax if it's loaded
+        if (typeof MathJax !== 'undefined' && MathJax.typesetPromise) {
+            MathJax.typesetPromise([container]).catch(err => console.log('MathJax error:', err));
+        }
+
+    } catch (error) {
+        console.error('Error loading post:', error);
+        document.getElementById('post-container').innerHTML = `<p>Error loading post: ${error.message}</p>`;
+    }
+}
+
+document.addEventListener('DOMContentLoaded', loadPost); // Start rendering once the DOM is parsed, so #post-container exists when loadPost looks it up.
diff --git a/aigc/posts/image-tokenization/index.html b/aigc/posts/image-tokenization/index.html
@@ -0,0 +1,195 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>ritchie@singapore~$ AIGC Post</title>
+  
+  <!-- Preload critical resources -->
+  <link rel="preload" href="https://fonts.googleapis.com/css2?family=Fira+Code&display=swap" as="style" onload="this.onload=null;this.rel='stylesheet'">
+  <noscript><link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Fira+Code&display=swap"></noscript>
+  
+  <!-- Critical CSS loaded synchronously -->
+  <link rel="stylesheet" href="/css/styles.css">
+  
+  <!-- Load Marked.js for markdown rendering - load before app.js -->
+  <script src="/lib/marked/marked.min.js"></script>
+
+  <!-- Defer heavy scripts to prevent render blocking -->
+  <script src="/lib/tailwind/tailwind-cdn.js" defer></script>
+
+  <!-- MathJax Configuration - defer loading -->
+  <script>
+    window.MathJax = {
+      tex: { inlineMath: [['$', '$'], ['\\(', '\\)']] },
+      svg: { fontCache: 'global' }
+    };
+  </script>
+  <script src="/lib/mathjax/tex-mml-chtml.js" defer></script>
+  
+  <style>
+    .post-content { /* Base typography for everything rendered inside #post-container */
+      color: #efefef;
+      line-height: 1.8;
+    }
+
+    .post-content h1,
+    .post-content h2,
+    .post-content h3,
+    .post-content h4,
+    .post-content h5,
+    .post-content h6 {
+      color: #f1c40f;
+      margin-top: 1.5rem;
+      margin-bottom: 0.5rem;
+      font-weight: bold;
+    }
+
+    .post-content h1 { font-size: 2rem; }
+    .post-content h2 { font-size: 1.5rem; border-bottom: 1px solid #333; padding-bottom: 0.5rem; }
+    .post-content h3 { font-size: 1.25rem; }
+
+    .post-content p {
+      margin-bottom: 1rem;
+    }
+
+    .post-content code {
+      background-color: #1a1a1a;
+      color: #00ff88;
+      padding: 0.2rem 0.4rem;
+      border-radius: 3px;
+      font-size: 0.9em;
+      font-family: 'Fira Code', monospace;
+    }
+
+    .post-content pre {
+      background-color: #1a1a1a;
+      border-left: 3px solid #00ff88;
+      padding: 1rem;
+      border-radius: 4px;
+      overflow-x: auto;
+      margin: 1rem 0;
+      font-family: 'Fira Code', monospace;
+    }
+
+    .post-content pre code { /* Inline-code chrome is reset inside fenced blocks; the <pre> supplies it */
+      background-color: transparent;
+      color: #00ff88;
+      padding: 0;
+      border-radius: 0;
+    }
+
+    .post-content blockquote {
+      border-left: 4px solid #00ccff;
+      padding-left: 1rem;
+      color: #b0b0b0;
+      margin: 1rem 0;
+      font-style: italic;
+    }
+
+    .post-content ul,
+    .post-content ol {
+      margin-left: 2rem;
+      margin-bottom: 1rem;
+    }
+
+    .post-content li {
+      margin-bottom: 0.5rem;
+    }
+
+    .post-content a {
+      color: #00ccff;
+      text-decoration: underline;
+    }
+
+    .post-content a:hover {
+      color: #00ff88;
+    }
+
+    .post-content table {
+      border-collapse: collapse;
+      width: 100%;
+      margin: 1rem 0;
+      border: 1px solid #333;
+    }
+
+    .post-content table th,
+    .post-content table td {
+      border: 1px solid #333;
+      padding: 0.75rem;
+      text-align: left;
+    }
+
+    .post-content table th {
+      background-color: #1a1a1a;
+      color: #f1c40f;
+      font-weight: bold;
+    }
+
+    .post-content table tr:hover {
+      background-color: #0a0a0a;
+    }
+
+    .post-header { /* Header block that app.js builds: title plus Published/Category meta */
+      padding-bottom: 1.5rem;
+      margin-bottom: 2rem;
+    }
+
+    .post-header::after { /* Short accent rule drawn under the header */
+      content: '';
+      display: block;
+      width: 40px;
+      height: 1px;
+      background: #00ff88;
+      margin-top: 1rem;
+    }
+
+    .post-content .post-title { /* Scoped under .post-content so it outranks the generic ".post-content h1" rules above */
+      color: #f1c40f;
+      font-size: 2.5rem;
+      margin-bottom: 0;
+      margin-top: 0;
+      font-weight: 600;
+      letter-spacing: -0.5px;
+    }
+
+    .post-meta {
+      display: flex;
+      gap: 2rem;
+      flex-wrap: wrap;
+      margin-top: 1rem;
+      font-size: 0.95rem;
+    }
+
+    .post-meta-item {
+      color: #00ccff;
+      font-size: 0.9rem;
+      font-family: 'Fira Code', monospace;
+    }
+
+    .post-meta-label {
+      color: #989898;
+      font-weight: 500;
+    }
+  </style>
+</head>
+
+<body>
+  <div class="container">
+    <div id="menu"></div>
+
+    <div id="post-container" class="post-content">
+      <!-- Post content will be loaded here -->
+    </div>
+
+    <div class="bash-line" style="margin-top: 2rem;">
+      <span class="prompt">ritchie@singapore</span>:<span class="command">~/aigc/posts</span>$ <span class="cursor">|</span>
+    </div>
+  </div>
+
+  <!-- marked.js is loaded synchronously in <head>, so it is already available when the deferred app.js runs. -->
+  <!-- Deferred scripts execute in document order once parsing finishes: menu.js first, then app.js. -->
+  <script src="/js/menu.js" defer></script>
+  <script src="./app.js" defer></script>
+</body>
+</html>