Creating a Llama or GPT Model for Next-Token Prediction

import dataclasses

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

@dataclasses.dataclass
class LlamaConfig:
    """Define Llama model hyperparameters."""

    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA

def rotate_half(x: Tensor) -> Tensor:
    """Rotate half the hidden dims of the input.

    This is a helper function for rotary position embeddings (RoPE).
    The last dimension is split into two equal halves ``(x1, x2)`` and
    ``(-x2, x1)`` is returned, i.e. the halves are swapped and the one that
    moves to the front is negated.

    Args:
        x: Input tensor of shape (..., d)

    Returns:
        Tensor of the same shape with the last dimension rotated
    """
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)  # Concatenate with rotation

class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding (RoPE).

    Precomputes cosine and sine tables of shape
    ``(max_position_embeddings, dim)`` at construction time and applies them
    to 4-D inputs in ``forward``.
    """

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings

        # Compute the matrix of position * theta_i angles.
        base = 10_000.0
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # Duplicate the frequencies so both halves used by rotate_half share them.
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        position = torch.arange(max_position_embeddings).float()
        sinusoid_inp = torch.outer(position, inv_freq)

        # Save cosine and sine matrices as buffers, not parameters.
        self.register_buffer("cos", sinusoid_inp.cos())
        self.register_buffer("sin", sinusoid_inp.sin())

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim)

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        # Unpacking four values also enforces that x is 4-D.
        _, seq_len, _, _ = x.shape
        dtype = x.dtype

        # Slice first so only the rows actually used are cast to x's dtype,
        # then reshape to 4-D so the tables broadcast over batch and heads.
        cos = self.cos[:seq_len].to(dtype).view(1, seq_len, 1, -1)
        sin = self.sin[:seq_len].to(dtype).view(1, seq_len, 1, -1)

        # Apply RoPE to x.
        return (x * cos) + (rotate_half(x) * sin)

class LlamaAttention(nn.Module):
    """Grouped-query attention with rotary embeddings."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the Q/K/V/output projections from the model configuration.

        Args:
            config: Model hyperparameters; reads ``hidden_size``,
                ``num_attention_heads`` and ``num_key_value_heads``.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_kv_heads = config.num_key_value_heads  # GQA: fewer KV heads than query heads

        # hidden_size must be divisible by num_heads
        assert (self.head_dim * self.num_heads) == self.hidden_size, (
            "hidden_size must be divisible by num_attention_heads"
        )
        # Grouped-query attention shares each KV head across an equal-sized
        # group of query heads, so the head counts must divide evenly.
        assert self.num_heads % self.num_kv_heads == 0, (
            "num_attention_heads must be divisible by num_key_value_heads"
        )

        # Linear layers for Q, K, V projections
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: Input of shape (batch_size, seq_len, hidden_size)
            rope: Module applied to the query and key tensors (in
                batch/seq/heads/head_dim layout) to inject positions
            attn_mask: Mask forwarded unchanged to
                ``F.scaled_dot_product_attention``

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size)
        """
        bs, seq_len, _ = hidden_states.size()

        # Project inputs to Q, K, V
        query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

        # Apply rotary position embeddings
        query_states = rope(query_states)
        key_states = rope(key_states)

        # Transpose tensors from BSHD to BHSD layout for scaled_dot_product_attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Use PyTorch's optimized attention implementation.
        # Setting is_causal=True is incompatible with passing an explicit
        # attention mask, so causality must be encoded in attn_mask.
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_mask,
            dropout_p=0.0,
            enable_gqa=True,
        )

        # Transpose output from BHSD back to BSHD, flatten heads to 3-D, then project
        attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        return attn_output

class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the gate, up and down projections.

        Args:
            config: Model hyperparameters; reads ``hidden_size`` and
                ``intermediate_size``.
        """
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SwiGLU activation function
        # Project back to hidden size
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the SwiGLU feed-forward transformation.

        Args:
            x: Input tensor whose last dimension is ``hidden_size``

        Returns:
            Tensor with the same last dimension as ``x``
        """
        # SwiGLU activation: multiply gate and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)

class LlamaDecoderLayer(nn.Module):
    """Single transformer layer for a Llama model."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the attention and MLP sub-layers with their pre-norms.

        Args:
            config: Model hyperparameters shared by all sub-modules.
        """
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Apply self-attention and MLP, each wrapped in a residual connection.

        Args:
            hidden_states: Input of shape (batch_size, seq_len, hidden_size)
            rope: Rotary position encoding module passed through to attention
            attn_mask: Attention mask passed through to attention

        Returns:
            Tensor of the same shape as ``hidden_states``
        """
        # First residual block: Self-attention (normalization applied before attention)
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope, attn_mask=attn_mask)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP (normalization applied before the MLP)
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states

class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the embedding table, decoder stack, and final norm.

        Args:
            config: Model hyperparameters.
        """
        super().__init__()
        # RoPE is applied per attention head, so its dimension is the head dimension.
        self.rotary_emb = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Compute the final hidden states for a batch of token IDs.

        Args:
            input_ids: Token IDs of shape (batch_size, seq_len)
            attn_mask: Attention mask passed to every decoder layer

        Returns:
            Hidden states of shape (batch_size, seq_len, hidden_size)
        """
        # Convert input token IDs to embeddings
        hidden_states = self.embed_tokens(input_ids)

        # Process through all transformer layers, then the final norm layer.
        # The same rotary module is shared by every layer.
        for layer in self.layers:
            hidden_states = layer(hidden_states, rope=self.rotary_emb, attn_mask=attn_mask)
        hidden_states = self.norm(hidden_states)

        # Return the final hidden states
        return hidden_states

class LlamaForPretraining(nn.Module):
    """Llama base model followed by a linear head over the vocabulary."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the base model and the language-model head.

        Args:
            config: Model hyperparameters; ``hidden_size`` and ``vocab_size``
                determine the shape of the output head.
        """
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Compute per-position vocabulary logits.

        Args:
            input_ids: Token IDs of shape (batch_size, seq_len)
            attn_mask: Attention mask passed to the base model

        Returns:
            Logits of shape (batch_size, seq_len, vocab_size)
        """
        hidden_states = self.base_model(input_ids, attn_mask)
        return self.lm_head(hidden_states)

def create_causal_mask(seq_len: int, device: torch.device, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a causal mask for self-attention.

    The mask is additive: entries strictly above the diagonal (future
    positions) are ``-inf`` and all other entries are ``0``.

    Args:
        seq_len: Length of the sequence
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Causal mask of shape (seq_len, seq_len)
    """
    # Start from an all -inf matrix; triu(diagonal=1) zeroes the diagonal and
    # everything below it, leaving -inf only on future positions.
    mask = torch.full(
        (seq_len, seq_len), float("-inf"), device=device, dtype=dtype
    ).triu(diagonal=1)
    return mask

def create_padding_mask(
    batch: Tensor,
    padding_token_id: int,
    device: torch.device,
    dtype: torch.dtype = torch.float32,
) -> Tensor:
    """Create a padding mask for a batch of sequences for self-attention.

    The mask is additive: entry ``[b, 0, i, j]`` is ``-inf`` when token ``i``
    or token ``j`` of sequence ``b`` equals ``padding_token_id``, and ``0``
    otherwise.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        padding_token_id: ID of the padding token
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Padding mask of shape (batch_size, 1, seq_len, seq_len)
    """
    # Per-token vector: 0 for real tokens, -inf for padding tokens.
    padded = torch.zeros_like(batch, device=device, dtype=dtype).masked_fill(
        batch == padding_token_id, float("-inf")
    )
    # Broadcast-add as column and row so either a padded query or a padded
    # key makes the pair -inf.
    mask = padded[:, :, None] + padded[:, None, :]
    # Insert a singleton head dimension so the mask broadcasts over heads.
    return mask[:, None, :, :]

# Create model with default config
test_config = LlamaConfig()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LlamaModel(test_config).to(device)

# Print the model size; multiply element count by bytes per element so the
# reported figure really is in MB rather than in millions of elements.
print(f"Model parameters size: {sum(p.numel() * p.element_size() for p in model.parameters()) / 1024**2:.2f} MB")
print(f"Model buffers size: {sum(b.numel() * b.element_size() for b in model.buffers()) / 1024**2:.2f} MB")

# Create a random tensor of token IDs; sampling starts at 1 so that
# PAD_TOKEN_ID (0) only appears where it is written explicitly below.
PAD_TOKEN_ID = 0
bs, seq_len = 5, 13
x = torch.randint(1, test_config.vocab_size, (bs, seq_len), dtype=torch.int32, device=device)

# Set a different length of padding tokens at the end of each sequence
for i, pad_length in enumerate([4, 1, 0, 3, 8]):
    # Guard against 0: x[i, -0:] would select the whole row.
    if pad_length > 0:
        x[i, -pad_length:] = PAD_TOKEN_ID

# Create causal and padding masks; the (seq, seq) causal mask broadcasts
# against the (batch, 1, seq, seq) padding mask when added.
causal_mask = create_causal_mask(seq_len, device)
padding_mask = create_padding_mask(x, PAD_TOKEN_ID, device)
attn_mask = causal_mask + padding_mask
print(f"Input ids: {x}")
print(f"Attention mask: {attn_mask}")

# Run the model
output = model(x, attn_mask)
print("OK")

Creating a Llama or GPT Model for Next-Token Prediction

The Machine Learning “Advent Calendar” Day 13: LASSO and Ridge Regression in Excel

Scaling MLflow for enterprise AI: What’s New in SageMaker AI with MLflow

Scaling MLflow for enterprise AI: What’s New in SageMaker AI with MLflow

Leave a Reply Cancel reply

Popular News

Best practices for Amazon SageMaker HyperPod task governance

How Cursor Really Indexes Your Codebase

Accelerate edge AI development with SiMa.ai Edgematic with a seamless AWS integration

Unlocking Japanese LLMs with AWS Trainium: Innovators Showcase from the AWS LLM Development Support Program

Optimizing Mixtral 8x7B on Amazon SageMaker with AWS Inferentia2

About Us

Category

Recent Posts