This model is significantly undertrained and designed for research purposes only.
For use with the `transformers` library:
```python
from transformers import AutoTokenizer, GPT2Model
import torch
import torch.nn as nn


class RMSLayerNorm(nn.Module):
    """RMS normalization with a single learned scalar gain."""

    def __init__(self, normalized_shape, eps=1e-8, affine=True):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.eps = eps
        self.affine = affine
        if self.affine:
            # A single scalar gain is shared across all features.
            self.weight = nn.Parameter(torch.ones(()))
        else:
            self.register_parameter("weight", None)
        # RMS normalization uses no bias term.
        self.register_parameter("bias", None)

    def forward(self, x):
        # Normalize by the root mean square over the feature dimension.
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        x_normalized = x / rms
        if self.affine:
            x_normalized = x_normalized * self.weight
        return x_normalized


def replace(model):
    # Recursively swap every LayerNorm submodule for an RMSLayerNorm.
    for name, child in model.named_children():
        if isinstance(child, nn.LayerNorm):
            setattr(model, name, RMSLayerNorm(child.normalized_shape, eps=child.eps, affine=True))
        else:
            replace(child)
    return model


class GPTR2Model(GPT2Model):
    def __init__(self, config):
        super().__init__(config)
        replace(self)


model = GPTR2Model.from_pretrained("George-Ogden/gptr2-nano-without-momentum-with-weight-decay")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
```
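Each `RMSLayerNorm` divides its input by the root mean square of its features (plus `eps`) and scales the result by a single learned scalar, in place of GPT-2's per-feature LayerNorm. As a quick sanity check (a minimal sketch; the prompt and printed shape are illustrative, not part of the original card), you can run a forward pass through the modified model:

```python
import torch

# Encode a sample prompt and run it through the RMS-normalized model.
inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# GPT2Model returns hidden states rather than language-model logits.
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```

You can also confirm that `replace` left no standard LayerNorm modules behind:

```python
assert not any(isinstance(m, nn.LayerNorm) for m in model.modules())
```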
For more details and example usage, see https://github.com/George-Ogden/residual-streams.