import React, { useState } from 'react';

export default function Step1Tokenization({ onComplete, onNext }) {
  const [inputText, setInputText] = useState("Hello, GPT-2!");
  const [tokens, setTokens] = useState([]);
  const [quizAnswer, setQuizAnswer] = useState('');
  const [quizFeedback, setQuizFeedback] = useState('');

  // Simplified tokenization for the demo: split on whitespace and common
  // punctuation. Real GPT-2 uses byte-level BPE with learned merges, so the
  // actual token boundaries differ (see the sketch below).
  const tokenize = (text) => {
    // The capture group keeps punctuation as its own token; pure-whitespace
    // matches trim to '' and are dropped by the filter.
    const simpleTokens = text.split(/(\s+|[,.!?])/g).filter(t => t.trim());
    setTokens(simpleTokens);
  };
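
  // For contrast, real GPT-2 tokenization via a BPE library (a sketch, not
  // wired into this demo; assumes a package like "js-tiktoken" that exposes
  // GPT-2's "r50k_base" encoding, and the exact API depends on the library):
  //
  //   import { getEncoding } from 'js-tiktoken';
  //   const enc = getEncoding('r50k_base'); // GPT-2's 50,257-token vocabulary
  //   enc.encode('Hello');                  // -> [15496], matching the example below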

  // Naive keyword check: accept any answer that mentions the key concept
  // ("subword" / "sub-word"). Good enough for a demo, not a real grader.
  const checkQuiz = () => {
    const correct = /sub[- ]?word/.test(quizAnswer.toLowerCase());
    setQuizFeedback(correct
      ? '✓ Correct! BPE breaks text into subword units, allowing the model to handle unknown words.'
      : '✗ Try again. Think about how BPE handles rare or unknown words.'
    );
    if (correct) onComplete();
  };

  return (
    <div className="space-y-8">
      <div>
        <h2 className="text-3xl font-bold mb-2">Step 1: Tokenization & Embeddings</h2>
        <p className="text-gray-400">How text becomes numbers that GPT-2 can understand</p>
      </div>

      {/* Explanation */}
      <div className="bg-gray-800 rounded-lg p-6 space-y-4">
        <h3 className="text-xl font-semibold text-emerald-400">What is Tokenization?</h3>
        <p className="text-gray-300">
          GPT-2 can't process raw text; it needs numbers. <strong>Tokenization</strong> converts text into a sequence of tokens (subword units).
        </p>
        <p className="text-gray-300">
          GPT-2 uses <strong>Byte-Pair Encoding (BPE)</strong> with a vocabulary of 50,257 tokens. This allows it to:
        </p>
        <ul className="list-disc list-inside space-y-1 text-gray-300 ml-4">
          <li>Handle any text (including rare words)</li>
          <li>Break unknown words into known subwords</li>
          <li>Keep common words as single tokens</li>
        </ul>
      </div>
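
      {/*
        Concrete example of the subword idea (a hypothetical split; actual
        boundaries depend on GPT-2's learned merge rules): a rare word such as
        "tokenization" can be encoded as known pieces like "token" + "ization"
        rather than falling back to a single unknown-word token.
      */}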

      {/* Interactive Demo */}
      <div className="bg-gray-800 rounded-lg p-6 space-y-4">
        <h3 className="text-xl font-semibold text-emerald-400">Try it Yourself</h3>
        <div>
          <label className="block text-sm text-gray-400 mb-2">Enter text:</label>
          <input
            type="text"
            value={inputText}
            onChange={(e) => setInputText(e.target.value)}
            className="w-full bg-gray-700 text-white px-4 py-2 rounded border border-gray-600 focus:border-emerald-500 focus:outline-none"
            placeholder="Type anything..."
          />
        </div>
        <button
          onClick={() => tokenize(inputText)}
          className="px-6 py-2 bg-emerald-600 hover:bg-emerald-700 rounded font-semibold transition-colors"
        >
          Tokenize
        </button>

        {tokens.length > 0 && (
          <div className="mt-4">
            <div className="text-sm text-gray-400 mb-2">Tokens ({tokens.length}):</div>
            <div className="flex flex-wrap gap-2">
              {tokens.map((token, i) => (
                <div key={i} className="bg-emerald-900 text-emerald-100 px-3 py-1 rounded text-sm font-mono">
                  {token}
                </div>
              ))}
            </div>
          </div>
        )}
      </div>

      {/* Embedding Explanation */}
      <div className="bg-gray-800 rounded-lg p-6 space-y-4">
        <h3 className="text-xl font-semibold text-emerald-400">Token Embeddings</h3>
        <p className="text-gray-300">
          Each token is converted to a <strong>learned embedding vector</strong> of size 768 (for GPT-2 Small).
        </p>
        <div className="bg-gray-900 p-4 rounded font-mono text-sm text-gray-300">
          Token "Hello" → Token ID: 15496 → Embedding: [0.23, -0.45, 0.12, ..., 0.67] (768 dimensions)
        </div>
        <p className="text-gray-300">
          These embeddings are <strong>learned during training</strong> so that similar tokens have similar vectors.
        </p>
      </div>
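
      {/*
        Minimal sketch of the lookup itself ("wte" follows the naming used in
        the released GPT-2 weights; the numbers mirror the example above):

          wte         -> learned matrix of shape [50257, 768]
          wte[15496]  -> the 768-dim embedding row for token "Hello"

        Embedding lookup is pure row indexing; no arithmetic on the text.
      */}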

      {/* Exercise */}
      <div className="bg-blue-900 bg-opacity-30 border border-blue-700 rounded-lg p-6 space-y-4">
        <h3 className="text-xl font-semibold text-blue-400">📝 Exercise</h3>
        <p className="text-gray-300">
          Why does GPT-2 use Byte-Pair Encoding instead of word-level tokenization?
        </p>
        <textarea
          value={quizAnswer}
          onChange={(e) => setQuizAnswer(e.target.value)}
          className="w-full bg-gray-700 text-white px-4 py-2 rounded border border-gray-600 focus:border-blue-500 focus:outline-none h-24"
          placeholder="Your answer..."
        />
        <button
          onClick={checkQuiz}
          className="px-6 py-2 bg-blue-600 hover:bg-blue-700 rounded font-semibold transition-colors"
        >
          Check Answer
        </button>
        {quizFeedback && (
          <div className={`p-3 rounded ${quizFeedback.startsWith('✓') ? 'bg-green-900 text-green-200' : 'bg-red-900 text-red-200'}`}>
            {quizFeedback}
          </div>
        )}
      </div>

      {/* Navigation */}
      <div className="flex justify-end">
        <button
          onClick={onNext}
          className="px-6 py-3 bg-emerald-600 hover:bg-emerald-700 rounded font-semibold transition-colors"
        >
          Next: Positional Encoding →
        </button>
      </div>
    </div>
  );
}
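
// Usage sketch (hypothetical parent; `markStepDone` and `setActiveStep` are
// illustrative names, not part of this file):
//
//   <Step1Tokenization
//     onComplete={() => markStepDone(1)}
//     onNext={() => setActiveStep(2)}
//   />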