Creating a Llama or GPT Model for Next-Token Prediction

import dataclasses

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

@dataclasses.dataclass
class LlamaConfig:
    """Define Llama model hyperparameters."""

    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA

def rotate_half(x: Tensor) -> Tensor:
    """Rotate half the hidden dims of the input.

    This is a helper function for rotary position embeddings (RoPE).
    The last dimension is split into two halves ``(x1, x2)`` and the
    result is ``(-x2, x1)`` concatenated along that same dimension.

    Args:
        x: Input tensor of shape (..., d)

    Returns:
        Tensor of the same shape with the last dimension rotated
    """
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)  # Concatenate with rotation

class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding."""

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        # compute a matrix of n * theta_i
        N = 10_000.0
        inv_freq = 1.0 / (N ** (torch.arange(0, dim, 2) / dim))
        # duplicate the frequencies so they line up with the two halves used by rotate_half
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        # create positions in the same dtype as inv_freq so the outer product needs no promotion
        position = torch.arange(max_position_embeddings, dtype=inv_freq.dtype)
        sinusoid_inp = torch.outer(position, inv_freq)
        # save cosine and sine matrices as buffers, not parameters
        self.register_buffer("cos", sinusoid_inp.cos())
        self.register_buffer("sin", sinusoid_inp.sin())

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim).
                seq_length must not exceed max_position_embeddings, otherwise the
                reshape of the cached tables below fails.

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        # unpacking also enforces that x is a 4D tensor
        _, seq_len, _, _ = x.shape
        dtype = x.dtype
        # transform the cosine and sine matrices to 4D tensors of the same dtype as x
        cos = self.cos.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        sin = self.sin.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        # apply RoPE to x
        output = (x * cos) + (rotate_half(x) * sin)
        return output

class LlamaAttention(nn.Module):
    """Grouped-query attention with rotary embeddings."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the Q/K/V/O projections from the model configuration.

        Args:
            config: Model hyperparameters; reads hidden_size, num_attention_heads
                and num_key_value_heads.

        Raises:
            ValueError: If hidden_size is not divisible by num_attention_heads, or
                num_attention_heads is not divisible by num_key_value_heads.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_kv_heads = config.num_key_value_heads  # GQA: H_kv < H_q

        # hidden_size must be divisible by num_heads
        if self.head_dim * self.num_heads != self.hidden_size:
            raise ValueError(
                f"hidden_size ({self.hidden_size}) must be divisible by "
                f"num_attention_heads ({self.num_heads})"
            )
        # grouped-query attention shares each key/value head across a whole group of query heads
        if self.num_heads % self.num_kv_heads != 0:
            raise ValueError(
                f"num_attention_heads ({self.num_heads}) must be divisible by "
                f"num_key_value_heads ({self.num_kv_heads})"
            )

        # Linear layers for Q, K, V projections
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Run self-attention over hidden_states.

        Args:
            hidden_states: Input of shape (batch_size, seq_len, hidden_size)
            rope: Rotary position encoding module applied to queries and keys
            attn_mask: Mask forwarded unchanged to scaled_dot_product_attention
                (presumably an additive float mask broadcastable to
                (batch_size, num_heads, seq_len, seq_len) -- confirm with caller)

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size)
        """
        bs, seq_len, _ = hidden_states.size()

        # Project inputs to Q, K, V
        query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

        # Apply rotary position embeddings
        query_states = rope(query_states)
        key_states = rope(key_states)

        # Transpose tensors from BSHD to BHSD dimension for scaled_dot_product_attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Use PyTorch's optimized attention implementation
        # setting is_causal=True is incompatible with setting explicit attention mask
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_mask,
            dropout_p=0.0,
            enable_gqa=True,
        )

        # Transpose output tensor from BHSD to BSHD dimension, reshape to 3D, and then project output
        attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        return attn_output

class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the gate, up and down projections.

        Args:
            config: Model hyperparameters; reads hidden_size and intermediate_size.
        """
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SwiGLU activation function
        # Project back to hidden size
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the SwiGLU feed-forward transform to x.

        Args:
            x: Input tensor whose last dimension is hidden_size

        Returns:
            Tensor with the same shape as x
        """
        # SwiGLU activation: multiply gate and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)

class LlamaDecoderLayer(nn.Module):
    """Single transformer layer for a Llama model."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the attention and MLP sub-blocks with their pre-norm layers.

        Args:
            config: Model hyperparameters shared by all sub-modules.
        """
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Apply pre-norm self-attention and MLP, each with a residual connection.

        Args:
            hidden_states: Input of shape (batch_size, seq_len, hidden_size)
            rope: Rotary position encoding module passed through to attention
            attn_mask: Attention mask passed through to attention

        Returns:
            Tensor with the same shape as hidden_states
        """
        # First residual block: Self-attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope, attn_mask=attn_mask)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states

class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the embedding table, the decoder stack and the final norm.

        Args:
            config: Model hyperparameters.
        """
        super().__init__()
        # a single RoPE table, sized per attention head, is shared by every layer
        self.rotary_emb = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Compute the final hidden states for a batch of token IDs.

        Args:
            input_ids: Integer token IDs of shape (batch_size, seq_len)
            attn_mask: Attention mask passed to every decoder layer

        Returns:
            Hidden states of shape (batch_size, seq_len, hidden_size)
        """
        # Convert input token IDs to embeddings
        hidden_states = self.embed_tokens(input_ids)
        # Process through all transformer layers, then the final norm layer
        for layer in self.layers:
            hidden_states = layer(hidden_states, rope=self.rotary_emb, attn_mask=attn_mask)
        hidden_states = self.norm(hidden_states)
        # Return the final hidden states
        return hidden_states

class LlamaForPretraining(nn.Module):
    """Llama base model with a linear language-modeling head on top."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the base model and the vocabulary projection.

        Args:
            config: Model hyperparameters; reads hidden_size and vocab_size for the head.
        """
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Compute next-token logits.

        Args:
            input_ids: Integer token IDs of shape (batch_size, seq_len)
            attn_mask: Attention mask passed to the base model

        Returns:
            Logits whose last dimension is vocab_size
        """
        hidden_states = self.base_model(input_ids, attn_mask)
        return self.lm_head(hidden_states)

def create_causal_mask(seq_len: int, device: torch.device, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a causal mask for self-attention.

    The mask is additive: entries strictly above the main diagonal are
    ``-inf`` and all other entries are 0.

    Args:
        seq_len: Length of the sequence
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Causal mask of shape (seq_len, seq_len)
    """
    # triu(diagonal=1) keeps -inf strictly above the diagonal and zeroes out the rest
    mask = torch.full((seq_len, seq_len), float("-inf"), device=device, dtype=dtype).triu(diagonal=1)
    return mask

def create_padding_mask(
    batch: Tensor,
    padding_token_id: int,
    device: torch.device,
    dtype: torch.dtype = torch.float32,
) -> Tensor:
    """Create a padding mask for a batch of sequences for self-attention.

    The mask is additive: entry (b, 0, i, j) is ``-inf`` when position i or
    position j of sequence b holds the padding token, and 0 otherwise. Note
    that this makes the whole row of a padded query position ``-inf``.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        padding_token_id: ID of the padding token
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Padding mask of shape (batch_size, 1, seq_len, seq_len)
    """
    # per-token additive penalty: -inf at padding positions, 0 elsewhere
    padded = torch.zeros_like(batch, device=device, dtype=dtype).masked_fill(
        batch == padding_token_id, float("-inf")
    )
    # broadcast the per-token penalty over both the query and the key axis
    mask = padded[:, :, None] + padded[:, None, :]
    # insert a singleton head axis so the mask broadcasts across attention heads
    return mask[:, None, :, :]

# Create model with default config
model_config = LlamaConfig()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LlamaForPretraining(model_config).to(device)

# print the model size (element counts, in units of 1024**2)
print(f"Model parameters size: {sum(p.numel() for p in model.parameters()) / 1024**2:.2f} M")
print(f"Model buffers size: {sum(p.numel() for p in model.buffers()) / 1024**2:.2f} M")

# Create a random tensor; IDs start at 1 so that 0 is free to serve as the padding token
PAD_TOKEN_ID = 0
bs, seq_len = 5, 13
x = torch.randint(1, model_config.vocab_size, (bs, seq_len), dtype=torch.int32, device=device)

# set random length of padding tokens at the end of each sequence
for i, pad_length in enumerate([4, 1, 0, 3, 8]):
    # a slice of x[i, -0:] would select the whole row, so skip rows without padding
    if pad_length > 0:
        x[i, -pad_length:] = PAD_TOKEN_ID

# Create causal and padding masks; the (seq_len, seq_len) causal mask broadcasts over the batch
causal_mask = create_causal_mask(seq_len, device)
padding_mask = create_padding_mask(x, PAD_TOKEN_ID, device)
attn_mask = causal_mask + padding_mask
print(f"Input ids: {x}")
print(f"Attention mask: {attn_mask}")

# Run the model
output = model(x, attn_mask)
print("OK")

Creating a Llama or GPT Model for Next-Token Prediction

Admin

3 Hidden Costco Gems You've Been Missing Out On

Leave a Reply Cancel reply

Recommended.

Hacker Behind Wired.com Leak Now Selling Full 40M Condé Nast Records – Hackread – Cybersecurity News, Data Breaches, AI, and More

Top 10 Best End-to-End Threat Intelligence Companies in 2025

Trending.

Backrooms director Kane Parsons explains the birds, the portals, and his practical effects

100 Most Expensive Keywords for Google Ads in 2026

Resident Evil fans have adopted a Love & Deepspace character as the son of Leon S. Kennedy and one of his potential spouses

Random Forest Algorithm in Machine Learning With Example

The Complete Guide to EcoGPT

AimactGrow

Categories

Recent News

Jackie Welles and Judy Alvarez

How to Set Up a Staging-to-Production Deployment Workflow for a Website