• About Us
  • Privacy Policy
  • Disclaimer
  • Contact Us
AimactGrow
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing
No Result
View All Result
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing
No Result
View All Result
AimactGrow
No Result
View All Result

Creating a Llama or GPT Model for Next-Token Prediction

Admin by Admin
January 16, 2026
Home AI
Share on FacebookShare on Twitter


import dataclasses

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

 

 

@dataclasses.dataclass
class LlamaConfig:
    """Define Llama model hyperparameters."""

    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA

 

 

def rotate_half(x: Tensor) -> Tensor:
    """Rotate half the hidden dims of the input.

    This is a helper function for rotary position embeddings (RoPE).
    The last dimension is split into two chunks ``(x1, x2)`` and the
    result is ``(-x2, x1)`` concatenated along that same dimension.

    Args:
        x: Input tensor of shape (..., d)

    Returns:
        Tensor of the same shape with rotated last dimension
    """
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)  # Concatenate with rotation

 

 

class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding."""

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Precomputes cosine and sine tables of shape
        (max_position_embeddings, dim) and stores them as buffers.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        # compute a matrix of n * theta_i
        N = 10_000.0
        inv_freq = 1.0 / (N ** (torch.arange(0, dim, 2) / dim))
        # duplicate the frequencies so the table lines up with the
        # two-chunk layout used by rotate_half()
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        # build positions in the same float dtype as inv_freq so that
        # torch.outer does not depend on integer/float type promotion
        position = torch.arange(max_position_embeddings, dtype=inv_freq.dtype)
        sinusoid_inp = torch.outer(position, inv_freq)
        # save cosine and sine matrices as buffers, not parameters
        self.register_buffer("cos", sinusoid_inp.cos())
        self.register_buffer("sin", sinusoid_inp.sin())

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim)

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        # unpacking four values also enforces that x is a 4D tensor
        _, seq_len, _, _ = x.shape
        dtype = x.dtype
        # reshape the cosine and sine matrices to 4D tensors of the same dtype
        # as x; the size-1 axes broadcast over batch and heads
        cos = self.cos.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        sin = self.sin.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        # apply RoPE to x
        output = (x * cos) + (rotate_half(x) * sin)
        return output

 

 

class LlamaAttention(nn.Module):
    """Grouped-query attention with rotary embeddings."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the Q, K, V and output projections.

        Args:
            config: Model hyperparameters; reads hidden_size,
                num_attention_heads and num_key_value_heads
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_kv_heads = config.num_key_value_heads  # GQA: H_kv < H_q

        # hidden_size must be divisible by num_heads
        assert (self.head_dim * self.num_heads) == self.hidden_size
        # fail fast: grouped-query attention needs every key/value head to
        # serve a whole number of query heads
        assert self.num_heads % self.num_kv_heads == 0

        # Linear layers for Q, K, V projections
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Run self-attention over the input sequence.

        Args:
            hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size)
            rope: Callable applied to the 4D query and key tensors
            attn_mask: Mask passed unchanged to scaled_dot_product_attention

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size)
        """
        bs, seq_len, _ = hidden_states.size()

        # Project inputs to Q, K, V
        query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

        # Apply rotary position embeddings (queries and keys only)
        query_states = rope(query_states)
        key_states = rope(key_states)

        # Transpose tensors from BSHD to BHSD layout for scaled_dot_product_attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Use PyTorch's optimized attention implementation
        # setting is_causal=True is incompatible with setting an explicit attention mask
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_mask,
            dropout_p=0.0,
            enable_gqa=True,
        )

        # Transpose output tensor from BHSD to BSHD layout, reshape to 3D, and then project output
        attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        return attn_output

 

 

class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the gate, up and down projections.

        Args:
            config: Model hyperparameters; reads hidden_size and intermediate_size
        """
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SwiGLU activation function
        # Project back to hidden size
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the gated feed-forward transformation.

        Args:
            x: Input tensor whose last dimension is hidden_size

        Returns:
            Tensor with the same last dimension as the input
        """
        # SwiGLU activation: multiply gate and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)

 

 

class LlamaDecoderLayer(nn.Module):
    """Single transformer layer for a Llama model."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the attention and MLP sub-blocks with their RMSNorm layers.

        Args:
            config: Model hyperparameters shared by both sub-blocks
        """
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Apply self-attention and the MLP, each inside a residual connection.

        Args:
            hidden_states: Input tensor, passed through both sub-blocks
            rope: Rotary position encoding forwarded to the attention module
            attn_mask: Attention mask forwarded to the attention module

        Returns:
            Tensor produced by the second residual block
        """
        # First residual block: Self-attention (normalization comes first)
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope, attn_mask=attn_mask)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states

 

 

class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the embedding table, decoder stack, final norm and RoPE tables.

        Args:
            config: Model hyperparameters
        """
        super().__init__()
        # a single RoPE module, sized to one attention head, is shared by all layers
        self.rotary_emb = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Embed the token IDs and run them through every decoder layer.

        Args:
            input_ids: Tensor of token IDs used to index the embedding table
            attn_mask: Attention mask forwarded to every decoder layer

        Returns:
            The final, normalized hidden states
        """
        # Convert input token IDs to embeddings
        hidden_states = self.embed_tokens(input_ids)
        # Process through all transformer layers, then the final norm layer
        for layer in self.layers:
            hidden_states = layer(hidden_states, rope=self.rotary_emb, attn_mask=attn_mask)
        hidden_states = self.norm(hidden_states)
        # Return the final hidden states
        return hidden_states

 

 

class LlamaForPretraining(nn.Module):
    """Llama base model followed by a linear head over the vocabulary."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the base model and the output projection.

        Args:
            config: Model hyperparameters
        """
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Run the base model and project its output to vocabulary size.

        Args:
            input_ids: Tensor of token IDs
            attn_mask: Attention mask forwarded to the base model

        Returns:
            Tensor whose last dimension is vocab_size
        """
        hidden_states = self.base_model(input_ids, attn_mask)
        return self.lm_head(hidden_states)

 

 

def create_causal_mask(seq_len: int, device: torch.device, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a causal mask for self-attention.

    Entries strictly above the main diagonal are -inf; the diagonal and
    everything below it are 0.

    Args:
        seq_len: Length of the sequence
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Causal mask of shape (seq_len, seq_len)
    """
    # fill everything with -inf, then keep only the strict upper triangle;
    # triu() zeroes the diagonal and the lower triangle
    mask = torch.full(
        (seq_len, seq_len), float("-inf"), device=device, dtype=dtype
    ).triu(diagonal=1)
    return mask

 

def create_padding_mask(batch, padding_token_id, device: torch.device, dtype: torch.dtype = torch.float32):
    """Create a padding mask for a batch of sequences for self-attention.

    An entry (b, 0, i, j) is -inf when token i or token j of sequence b
    equals the padding token, and 0 otherwise.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        padding_token_id: ID of the padding token
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Padding mask of shape (batch_size, 1, seq_len, seq_len)
    """
    # per-token vector: -inf at padding positions, 0 elsewhere
    padded = torch.zeros_like(batch, device=device, dtype=dtype).masked_fill(
        batch == padding_token_id, float("-inf")
    )
    # broadcasted sum masks both the row and the column of every padding token
    # NOTE(review): the row of a padding token is therefore entirely -inf,
    # which typically gives NaN after softmax - confirm that downstream code
    # ignores outputs at padding positions
    mask = padded[:, :, None] + padded[:, None, :]
    # add a size-1 axis so the mask broadcasts over attention heads
    return mask[:, None, :, :]

 

 

# Create model with default config
model_config = LlamaConfig()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LlamaForPretraining(model_config).to(device)
# print the model size (element counts, scaled by 1024**2)
print(f"Model parameters size: {sum(p.numel() for p in model.parameters()) / 1024**2:.2f} M")
print(f"Model buffers size: {sum(p.numel() for p in model.buffers()) / 1024**2:.2f} M")

# Create a random tensor of token IDs; IDs start at 1 so that 0 is free
# to act as the padding token
PAD_TOKEN_ID = 0
bs, seq_len = 5, 13
x = torch.randint(1, model_config.vocab_size, (bs, seq_len), dtype=torch.int32, device=device)
# overwrite the tail of each sequence with a different number of padding tokens
for i, pad_length in enumerate([4, 1, 0, 3, 8]):
    if pad_length > 0:  # x[i, -0:] would select the whole row, so skip 0
        x[i, -pad_length:] = PAD_TOKEN_ID
# Create causal and padding masks; the (seq_len, seq_len) causal mask
# broadcasts against the (bs, 1, seq_len, seq_len) padding mask
causal_mask = create_causal_mask(seq_len, device)
padding_mask = create_padding_mask(x, PAD_TOKEN_ID, device)
attn_mask = causal_mask + padding_mask
print(f"Input ids: {x}")
print(f"Attention mask: {attn_mask}")

# Run the model
output = model(x, attn_mask)
print("OK")

Tags: CreatingGPTLlamamodelNextTokenprediction
Admin

Admin

Next Post
3 Hidden Costco Gems You've Been Missing Out On

3 Hidden Costco Gems You've Been Missing Out On

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

Recommended.

Palantir signs a deal with The Nuclear Company under which the startup will pay Palantir $100M over 5 years to develop AI software for the nuclear industry (Miquela Thornton/Bloomberg)

Big Tech's dominance on the S&P 500 is becoming more extreme; the eight largest tech stocks account for 60% of the gains since the market bottomed in April 2025 (George Steer/Financial Times)

November 2, 2025
How to Prepare for CSS-Specific Interview Questions

How to Prepare for CSS-Specific Interview Questions

August 9, 2025

Trending.

AI-Assisted Threat Actor Compromises 600+ FortiGate Devices in 55 Countries

AI-Assisted Threat Actor Compromises 600+ FortiGate Devices in 55 Countries

February 23, 2026
10 tricks to begin getting ready! • Yoast

10 tricks to begin getting ready! • Yoast

July 21, 2025
Exporting a Cloth Simulation from Blender to an Interactive Three.js Scene

Exporting a Cloth Simulation from Blender to an Interactive Three.js Scene

August 20, 2025
Moonshot AI Releases 𝑨𝒕𝒕𝒆𝒏𝒕𝒊𝒐𝒏 𝑹𝒆𝒔𝒊𝒅𝒖𝒂𝒍𝒔 to Replace Fixed Residual Mixing with Depth-Wise Attention for Better Scaling in Transformers

Moonshot AI Releases 𝑨𝒕𝒕𝒆𝒏𝒕𝒊𝒐𝒏 𝑹𝒆𝒔𝒊𝒅𝒖𝒂𝒍𝒔 to Replace Fixed Residual Mixing with Depth-Wise Attention for Better Scaling in Transformers

March 16, 2026
Design Has Never Been More Important: Inside Shopify's Acquisition of Molly

Design Has Never Been More Important: Inside Shopify's Acquisition of Molly

September 8, 2025

AimactGrow

Welcome to AimactGrow, your ultimate source for all things technology! Our mission is to provide insightful, up-to-date content on the latest advancements in technology, coding, gaming, digital marketing, SEO, cybersecurity, and artificial intelligence (AI).

Categories

  • AI
  • Coding
  • Cybersecurity
  • Digital marketing
  • Gaming
  • SEO
  • Technology

Recent News

Paddling upstream | Seth's Blog

Green flags | Seth's Blog

March 17, 2026
You Can Easily Trick AI Chatbots Like ChatGPT And Gemini

You Can Easily Trick AI Chatbots Like ChatGPT And Gemini

March 17, 2026
  • About Us
  • Privacy Policy
  • Disclaimer
  • Contact Us

© 2025 https://blog.aimactgrow.com/ - All Rights Reserved

No Result
View All Result
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing

© 2025 https://blog.aimactgrow.com/ - All Rights Reserved