classdef GPT2Tokenizer < handle
    % GPT2Tokenizer   Object for encoding text so it can be fed to GPT-2
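    %
    % A hypothetical usage sketch (the model name 'gpt2-355M' and the
    % 'models' directory below are illustrative assumptions, not values
    % fixed by this class):
    %
    %   tokenizer = GPT2Tokenizer('gpt2-355M', 'models');
    %   numericTokens = tokenizer.encode('Hello world!');
    %   text = tokenizer.decode(numericTokens);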

    properties (SetAccess = private)
        % Encoding   String array of vocabulary entries; the numeric
        % token for a subword is its index into this array
        Encoding

        % BPERanks   Two-column string array of byte pairs, in merge-rank
        % order
        BPERanks

        % Cache   Map from previously encoded tokens to their BPE
        % encodings
        Cache = containers.Map()
    end

    properties (Constant)
        % TokenizationExpression   Regular expression used for tokenization
        %
        % This is the regular expression used for the first stage of
        % tokenization. It was hard-coded by the creators of GPT-2. It
        % appears to apply a tokenization rule that can be summarised as
        % follows:
        %
        % A token is one of the following things:
        %
        %   - An exact string match for 's, 't, 're, 've, 'm, 'll, or 'd.
        %     This means common contractions in words like don't and
        %     you'll will get split into their own tokens.
        %   - Zero or one spaces followed by one or more Unicode letters.
        %   - Zero or one spaces followed by one or more Unicode numbers.
        %   - Zero or one spaces followed by one or more things that are
        %     not whitespace, a Unicode letter or a Unicode number.
        %   - One or more whitespace characters not followed by a
        %     non-whitespace character. This is tricky to understand, but
        %     it basically means that a word preceded by several spaces,
        %     like '   Hello', will get split into '  ' and ' Hello'.
        %   - One or more whitespace characters.
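        %
        % For example, under these rules the string "You'll pay 5
        % dollars!" splits into the tokens:
        %
        %   "You"   "'ll"   " pay"   " 5"   " dollars"   "!"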
        %
        % Note that we have had to modify the original expression, which
        % is shown below (as a MATLAB char literal, so '' stands for a
        % single quote):
        %
        %   '''s|''t|''re|''ve|''m|''ll|''d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+'
        %
        % MATLAB's regexp function does not support the \p{...} character
        % classes, so we have replaced them with constructs that have
        % equivalent functionality.
        TokenizationExpression = '''s|''t|''re|''ve|''m|''ll|''d| ?((?![\d_])\w)+| ?\d+| ?(_|[^\s\w])+|\s+(?!\S)|\s+';

        % ByteEncoder   Encodes bytes into a set of 256 Unicode characters
        %
        % The size of the output vocabulary from this encoder determines
        % the size of the embedding needed by the GPT-2 transformer
        % model. The creators of GPT-2 wanted to keep this at around
        % 50,000. However, they also wanted to be able to encode any
        % Unicode string, and Unicode has potentially hundreds of
        % thousands of characters. To keep the overall vocabulary small,
        % we therefore go through an extra encoding stage:
        %
        %   - The raw Unicode string (which can contain any Unicode
        %     character) is converted into UTF-8 bytes. Note that UTF-8
        %     is a variable-length encoding scheme, so each character can
        %     get mapped to between 1 and 4 bytes.
        %   - These individual bytes are then mapped to a restricted
        %     vocabulary of 256 Unicode characters. ByteEncoder defines
        %     this mapping.
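        %
        % For example, the space byte (32) is not one of the printable
        % characters kept by this mapping, so it is represented by the
        % stand-in character char(256+32), which renders as 'Ġ'. This is
        % why GPT-2 vocabulary entries that start a new word begin with
        % 'Ġ'.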
        ByteEncoder = iBytesToUnicode()
    end

    methods
        function this = GPT2Tokenizer(modelName, modelsDirectory)
            % Read in the vocabulary. The UTF-8 part is really important
            % to make this work on Windows.
            fid = fopen([modelsDirectory filesep() modelName filesep() 'vocab.bpe'], 'r', 'n', 'UTF-8');
            bpeData = textscan(fid, '%s', 'Delimiter', '\n');
            fclose(fid);

            bpeData = bpeData{1};   % textscan always wraps its output in a cell
            bpeData(1) = [];        % Delete the first line we read in (it's a comment)

            % Split the BPE data into two columns.
            this.BPERanks = split(string(bpeData));

            % Read in the encoding data. The UTF-8 part is really
            % important to make this work on Windows.
            fid = fopen([modelsDirectory filesep() modelName filesep() 'encoder.txt'], 'r', 'n', 'UTF-8');
            encoderData = textscan(fid, '%s', 'Delimiter', '\n');
            fclose(fid);

            encoderData = encoderData{1};

            % Set the encoding
            this.Encoding = string(encoderData);
        end

        function numericTokens = encode(this, text)

            % Note that this function returns tokens with indices that
            % begin at 1. The Python implementation indexes from 0.

            % Step 1: Apply the regular expression to split the text into
            % words. See the comment for 'TokenizationExpression' for
            % more detail on what is going on here.
            [inputTokens, ~] = regexp( ...
                text, ...
                this.TokenizationExpression, ...
                'match', 'split');

            % Step 2: The incoming text is Unicode, and Unicode has a
            % huge set of characters. We do not want our BPE algorithm to
            % deal with a huge set of characters, because that would
            % inflate the BPE vocabulary, so we need to reduce the set of
            % characters. We do this by converting the Unicode text to
            % the UTF-8 encoding, and then replacing each UTF-8 byte with
            % a character from a set of 256 Unicode characters. This
            % means that our original string, which could have contained
            % any Unicode character, will now contain only characters
            % from that set of 256.
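            % For example, the token ' Hello' becomes the UTF-8 bytes
            % [32 72 101 108 108 111], which are then mapped to the six
            % characters 'ĠHello'.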
            encodedTokens = cellfun(@(x)unicode2native(x, 'UTF-8'), ...
                inputTokens, 'UniformOutput', false);
            encodedTokens = cellfun(@(x)this.ByteEncoder(x + 1), ...
                encodedTokens, 'UniformOutput', false);

            % Step 3: Do the BPE encoding on a per-word basis. Words are
            % either left as they are, or, for rare words, split into
            % word fragments.
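            % For example, a common word will typically map to a single
            % vocabulary entry, while an unusual string will be broken
            % into several known subword fragments.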
            bpeTokens = cellfun(@(x)this.bpe(x), encodedTokens, 'UniformOutput', false);

            % Step 4: Look up each word or word fragment and replace it
            % with a number.
            numericTokens = [];
            for i = 1:numel(bpeTokens)
                bpeTokensSplit = split(bpeTokens{i});
                for j = 1:numel(bpeTokensSplit)
                    numericTokens = [numericTokens find(this.Encoding == bpeTokensSplit(j))]; %#ok<AGROW>
                end
            end
        end

        function text = decode(this, numericTokens)

            % Note that this function expects tokens that begin at 1!
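            % In other words, for any string str, decode(encode(str))
            % should reproduce str exactly.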

            % Step 1: Turn tokens into text
            text = join(this.Encoding(numericTokens), '');

            % Step 2: Replace characters with byte values
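            % Comparing against the transposed ByteEncoder produces a
            % 256-by-N logical matrix with exactly one true entry per
            % column; max then returns the row index of that entry for
            % each column, which is the byte value plus one.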
            [~, text] = max(char(text) == this.ByteEncoder');
            text = text - 1;

            % Step 3: Decode byte values as UTF-8
            text = native2unicode(text, 'UTF-8');
        end
    end

    methods (Access = private)
        function word = bpe(this, token)
            if this.Cache.isKey(token)
                word = this.Cache(token);
            elseif isempty(token)
                word = token;
            else
                wordFragments = string(num2cell(token));
                pairs = iGetPairs(wordFragments);
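                % An illustrative sketch (these merge ranks are invented,
                % not actual GPT-2 ranks): for the token 'abcd',
                % wordFragments starts as ["a" "b" "c" "d"] and pairs as
                % {["a" "b"]; ["b" "c"]; ["c" "d"]}. If ["b" "c"] is the
                % pair with the lowest row index in BPERanks, the loop
                % below merges it first, and the next iteration works on
                % ["a" "bc" "d"].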

                while true
                    matches = [];
                    for i = 1:numel(pairs)
                        match = find(sum(pairs{i} == this.BPERanks, 2) == 2);
                        matches = [matches match]; %#ok<AGROW>
                    end
                    minIndex = min(matches);
                    if isempty(minIndex)
                        break;
                    end
                    bigram = this.BPERanks(minIndex, :);

                    first = bigram(1);
                    second = bigram(2);
                    newWordFragments = [];
                    i = 1;
                    while i < length(wordFragments) + 1
                        j = find( ...
                            wordFragments == first & ...
                            [zeros(1, (i-1)) ones(1, length(wordFragments)-i+1)]);
                        if isempty(j)
                            newWordFragments = [newWordFragments wordFragments(i:end)]; %#ok<AGROW>
                            break
                        else
                            newWordFragments = [newWordFragments wordFragments(i:(j(1)-1))]; %#ok<AGROW>
                            i = j(1);
                        end

                        if wordFragments(i) == first && ...
                                i < length(wordFragments) && ...
                                wordFragments(i+1) == second
                            newWordFragments = [newWordFragments first + second]; %#ok<AGROW>
                            i = i + 2;
                        else
                            newWordFragments = [newWordFragments wordFragments(i)]; %#ok<AGROW>
                            i = i + 1;
                        end
                    end

                    % We have a new word because we have merged some of
                    % the word fragments. If there is only one element in
                    % 'wordFragments', we have merged all of the
                    % fragments and can stop, so we break. Otherwise we
                    % generate pairs again and start the process again.
                    wordFragments = newWordFragments;
                    if numel(wordFragments) == 1
                        break;
                    else
                        pairs = iGetPairs(wordFragments);
                    end
                end

                word = join(wordFragments, ' ');
                this.Cache(token) = word;
            end
        end
    end
end

function cs = iBytesToUnicode()
    % Note that the third character here is not the letter i! It is the
    % extended Unicode character corresponding to the number 161.
    % cs = ['!':'~' '¡':'¬' '®':'ÿ'];
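    % After the loop and sort below, cs is a 1-by-256 char array in which
    % cs(b+1) is the stand-in character for byte value b.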
    cs = char([33:126 161:172 174:255]);
    bs = double(cs);
    n = 0;
    for b = 0:255
        if ~any(b == bs)
            bs = [bs b];        %#ok<AGROW>
            cs = [cs 256 + n];  %#ok<AGROW>
            n = n + 1;
        end
    end
    [~, sortedIndices] = sort(bs);
    cs = cs(sortedIndices);
end

function pairs = iGetPairs(wordFragments)
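    % iGetPairs   Return every pair of adjacent word fragments
    %
    % For example, iGetPairs(["a" "b" "c"]) returns the cell array
    % {["a" "b"]; ["b" "c"]}.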
    numLetters = length(wordFragments);
    pairIndices = [1:(numLetters - 1); 2:numLetters]';
    pairIndices = mat2cell(pairIndices, ones(numLetters - 1, 1), 2);
    pairs = cellfun(@(x)wordFragments(x), pairIndices, ...
        'UniformOutput', false);
    pairs = cellfun(@(x)[string(x(1)) string(x(2))], pairs, ...
        'UniformOutput', false);
end