This is an annotated PyTorch experiment to train an hourglass model. It is based on the training loop and configurations for a simple transformer auto-regressive NLP task.
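At a high level, the hourglass transformer (Nawrot et al., 2021) saves computation by shortening the token sequence, running layers at the lower resolution, and upsampling back to full resolution with a residual connection. Below is a rough, runnable sketch of just the resolution changes (hedged: shorten and upsample are simple stand-ins, not the labml_nn.transformers.hour_glass internals, and the transformer layers that run between the stages are omitted):

import torch

def shorten(x: torch.Tensor, k: int) -> torch.Tensor:
    # Stand-in shortening: average every k consecutive tokens
    # (the real model also shifts tokens to preserve autoregressive masking)
    seq_len, batch, d = x.shape
    return x.reshape(seq_len // k, k, batch, d).mean(dim=1)

def upsample(x: torch.Tensor, k: int) -> torch.Tensor:
    # Stand-in upsampling: repeat each token k times
    return x.repeat_interleave(k, dim=0)

def hourglass_sketch(x: torch.Tensor, factors) -> torch.Tensor:
    # Recurse through the shortening factors, combining each
    # upsampled result with a residual connection
    if factors:
        shortened = hourglass_sketch(shorten(x, factors[0]), factors[1:])
        x = x + upsample(shortened, factors[0])
    return x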
import math
from typing import List

import torch
from torch import nn

from labml import experiment
from labml.configs import option
from labml_helpers.module import Module
from labml_nn.experiments.nlp_autoregression import NLPAutoRegressionConfigs
from labml_nn.transformers.hour_glass import HourGlass
from labml_nn.transformers.positional_encoding import PositionalEncoding


class AutoregressiveTransformer(Module):
    def __init__(self, n_tokens: int, d_model: int, dropout: float, hour_glass: HourGlass):
        """
        * n_tokens is the vocabulary size
        * d_model is the size of the token embeddings
        * dropout is the dropout probability
        * hour_glass is the hourglass model
        """
        super().__init__()
        # Token embeddings
        self.embedding = nn.Embedding(n_tokens, d_model)
        # Positional embeddings
        self.pos_embedding = PositionalEncoding(d_model, dropout)
        # Hourglass model
        self.hour_glass = hour_glass
        # To normalize the final embeddings
        self.norm = nn.LayerNorm([d_model])
        # Embedding size
        self.d_model = d_model
        # Final linear layer to predict the logits
        self.output = nn.Linear(d_model, n_tokens)

    def __call__(self, x: torch.Tensor):
        """
        x is the tensor of token indexes, of shape [seq_len, batch_size]
        """
        # Get the token embeddings
        x = self.embedding(x)
        # Add positional encodings, scaling the embeddings by sqrt(d_model) first
        # (the convention from "Attention Is All You Need")
        if self.pos_embedding is not None:
            x = self.pos_embedding(x * math.sqrt(self.d_model))
        # Run the hourglass model
        x = self.hour_glass(x)
        # Normalize the final embeddings and get the logits
        output = self.output(self.norm(x))

        # Return the logits, and None for the (unused) state
        return output, None
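A minimal smoke test of the wrapper, as a sketch: the HourGlass argument order mirrors the _model function below, the sizes here are arbitrary illustrative values (not from the experiment), and the sequence length must be divisible by the product of the shortening factors.

def _smoke_test():
    # Arbitrary small sizes; HourGlass arguments follow _model below
    hour_glass = HourGlass(4, 128, 0.1, 256, [4, 2])
    model = AutoregressiveTransformer(n_tokens=65, d_model=128, dropout=0.1, hour_glass=hour_glass)
    # Token indexes of shape [seq_len, batch_size]; 32 is divisible by 4 * 2
    x = torch.randint(0, 65, (32, 2))
    logits, _ = model(x)
    # One logit per vocabulary token at every position
    assert logits.shape == (32, 2, 65)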
Configurations. This inherits from the training loop and configurations for a simple transformer auto-regressive NLP task.

class Configs(NLPAutoRegressionConfigs):
    # Model
    model: AutoregressiveTransformer

    # Number of attention heads
    n_heads: int = 8
    # Dropout probability
    dropout: float = 0.1
    # Size of the feed-forward hidden layer
    d_ff: int = 512
    # Token embedding size
    d_model: int = 256
    # Shortening factors
    shortening_factors: List[int] = [8, 4]
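With the default shortening factors, the hourglass shortens the sequence in two stages. A small illustrative helper (hypothetical, not part of the experiment) shows the sequence length at each level, assuming the length is divisible by each factor:

def shortened_lengths(seq_len: int, factors: List[int]) -> List[int]:
    # Sequence length at each level of the hourglass
    lengths = [seq_len]
    for factor in factors:
        lengths.append(lengths[-1] // factor)
    return lengths

# With seq_len = 256 and factors [8, 4]: 256 -> 32 -> 8
assert shortened_lengths(256, [8, 4]) == [256, 32, 8]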
Create the model.

@option(Configs.model)
def _model(c: Configs):
    # Create the hourglass model
    hour_glass = HourGlass(c.n_heads, c.d_model, c.dropout, c.d_ff, c.shortening_factors)
    # Create the auto-regressive wrapper and move it to the configured device
    m = AutoregressiveTransformer(c.n_tokens, c.d_model, c.dropout, hour_glass).to(c.device)

    return m
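The @option decorator registers _model as the function labml uses to compute Configs.model, so the model is only built when the configuration is first accessed, after any overrides from experiment.configs have been applied. As a hypothetical illustration (assuming labml selects between registered options by function name, the same mechanism as 'optimizer.optimizer': 'Noam' below), an alternative could be registered alongside it:

@option(Configs.model)
def _tiny_model(c: Configs):
    # A smaller, single-stage variant for quick debugging (hypothetical,
    # not part of the original experiment)
    hour_glass = HourGlass(c.n_heads, c.d_model, c.dropout, c.d_ff, [4])
    return AutoregressiveTransformer(c.n_tokens, c.d_model, c.dropout, hour_glass).to(c.device)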
def main():
    # Create the experiment
    experiment.create(name="hour_glass")
    # Create the configs
    conf = Configs()
    # Override configurations
    experiment.configs(conf, {
        # Use a character level tokenizer
        'tokenizer': 'character',
        # Prompt separator is blank
        'prompt_separator': '',
        # Starting prompt for sampling
        'prompt': 'It is ',
        # Use the Tiny Shakespeare dataset
        'text': 'tiny_shakespeare',

        # Use a context size of 256
        'seq_len': 256,
        # Train for 128 epochs
        'epochs': 128,
        # Batch size of 32
        'batch_size': 32,
        # Switch between training and validation 10 times per epoch
        'inner_iterations': 10,

        # Use the Noam optimizer
        'optimizer.optimizer': 'Noam',
        'optimizer.learning_rate': 1.,
    })
    # Set models for saving and loading
    experiment.add_pytorch_models({'model': conf.model})

    # Start the experiment
    with experiment.start():
        # Run training
        conf.run()


if __name__ == '__main__':
    main()
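For reference, the Noam optimizer selected above follows the learning-rate schedule from "Attention Is All You Need": a linear warm-up, then inverse square-root decay. A rough sketch of the schedule (the warm-up step count of 4000 is an assumption here, not a value taken from labml's implementation):

def noam_lr(step: int, d_model: int = 256, warmup: int = 4000, lr_mult: float = 1.0) -> float:
    # step must be >= 1; 'optimizer.learning_rate' acts as the multiplier lr_mult
    return lr_mult * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)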