• About Us
  • Privacy Policy
  • Disclaimer
  • Contact Us
AimactGrow
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing
No Result
View All Result
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing
No Result
View All Result
AimactGrow
No Result
View All Result

Creating a Llama or GPT Model for Next-Token Prediction

Admin by Admin
January 16, 2026
Home AI
Share on FacebookShare on Twitter


import dataclasses

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

 

 

@dataclasses.dataclass
class LlamaConfig:
    """Define Llama model hyperparameters."""

    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of the MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention (query) heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA

 

 

def rotate_half(x: Tensor) -> Tensor:
    """Rotate half the hidden dims of the input.

    This is a helper function for rotary position embeddings (RoPE).
    The last dimension is split into two halves ``(x1, x2)`` and the
    result is ``(-x2, x1)`` concatenated along that same dimension.

    Args:
        x: Input tensor of shape (..., d)

    Returns:
        Tensor of the same shape with the last dimension rotated
    """
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)  # Concatenate with rotation

 

 

class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding (RoPE).

    Precomputes cosine and sine tables of shape
    (max_position_embeddings, dim) and applies them to a 4D input.
    """

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor

        Raises:
            ValueError: If ``dim`` is odd; the frequency table is built from
                two concatenated halves and would not match the input width.
        """
        super().__init__()
        if dim % 2 != 0:
            raise ValueError(f"RoPE requires an even dim, got {dim}")
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        # compute a matrix of n * theta_i
        N = 10_000.0
        inv_freq = 1.0 / (N ** (torch.arange(0, dim, 2) / dim))
        # duplicate the frequencies so both halves used by rotate_half share them
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        # build positions in the same float dtype so torch.outer needs no promotion
        position = torch.arange(max_position_embeddings, dtype=inv_freq.dtype)
        sinusoid_inp = torch.outer(position, inv_freq)
        # save cosine and sine matrices as buffers, not parameters
        self.register_buffer("cos", sinusoid_inp.cos())
        self.register_buffer("sin", sinusoid_inp.sin())

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim)

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        # unpacking four values also enforces that x is 4D
        _, seq_len, _, _ = x.shape
        dtype = x.dtype
        # transform the cosine and sine matrices to 4D tensors of the same dtype as x
        cos = self.cos.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        sin = self.sin.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        # apply RoPE to x
        output = (x * cos) + (rotate_half(x) * sin)
        return output

 

 

class LlamaAttention(nn.Module):
    """Grouped-query attention with rotary embeddings."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the Q/K/V/output projections from ``config``.

        Args:
            config: Supplies hidden_size, num_attention_heads and
                num_key_value_heads.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_kv_heads = config.num_key_value_heads  # GQA: H_kv < H_q

        # hidden_size must be divisible by num_heads
        assert (self.head_dim * self.num_heads) == self.hidden_size

        # Linear layers for Q, K, V projections
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Run grouped-query self-attention over ``hidden_states``.

        Args:
            hidden_states: 3D input tensor (batch, seq_len, hidden_size)
            rope: Callable applied to the 4D query and key tensors
                (batch, seq_len, heads, head_dim) before attention
            attn_mask: Mask forwarded unchanged to
                ``F.scaled_dot_product_attention``

        Returns:
            Tensor of shape (batch, seq_len, hidden_size)
        """
        bs, seq_len, _ = hidden_states.size()

        # Project inputs to Q, K, V
        query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

        # Apply rotary position embeddings
        query_states = rope(query_states)
        key_states = rope(key_states)

        # Transpose tensors from BSHD to BHSD dimension for scaled_dot_product_attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Use PyTorch's optimized attention implementation
        # setting is_causal=True is incompatible with setting explicit attention mask
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_mask,
            dropout_p=0.0,
            enable_gqa=True,
        )

        # Transpose output tensor from BHSD to BSHD dimension, reshape to 3D, and then project output
        attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        return attn_output

 

 

class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the gate, up and down projections.

        Args:
            config: Supplies hidden_size and intermediate_size.
        """
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SiLU gate used by the SwiGLU activation
        # Project back to hidden size
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Apply the gated MLP to ``x``.

        Args:
            x: Input tensor whose last dimension is hidden_size

        Returns:
            Tensor with the same last dimension as ``x``
        """
        # SwiGLU activation: multiply gate and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)

 

 

class LlamaDecoderLayer(nn.Module):
    """Single transformer layer for a Llama model."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the two norm layers, the attention block and the MLP.

        Args:
            config: Model hyperparameters shared by all sub-modules.
        """
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Apply pre-norm attention and pre-norm MLP, each with a residual add.

        Args:
            hidden_states: Input tensor for this layer
            rope: Rotary position encoding passed through to the attention block
            attn_mask: Attention mask passed through to the attention block

        Returns:
            Tensor of the same shape as ``hidden_states``
        """
        # First residual block: Self-attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope, attn_mask=attn_mask)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states

 

 

class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the embedding table, decoder stack, final norm and shared RoPE.

        Args:
            config: Model hyperparameters.
        """
        super().__init__()
        # One RoPE table, sized per attention head, shared by every layer
        self.rotary_emb = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Embed ``input_ids`` and run them through every decoder layer.

        Args:
            input_ids: Integer token IDs looked up in the embedding table
            attn_mask: Attention mask passed to every decoder layer

        Returns:
            Final hidden states after the last norm layer
        """
        # Convert input token IDs to embeddings
        hidden_states = self.embed_tokens(input_ids)
        # Process through all transformer layers, then the final norm layer
        for layer in self.layers:
            hidden_states = layer(hidden_states, rope=self.rotary_emb, attn_mask=attn_mask)
        hidden_states = self.norm(hidden_states)
        # Return the final hidden states
        return hidden_states

 

 

class LlamaForPretraining(nn.Module):
    """Llama base model followed by a linear language-modeling head."""

    def __init__(self, config: LlamaConfig) -> None:
        """Build the base model and the vocabulary projection.

        Args:
            config: Model hyperparameters.
        """
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Return vocabulary logits for every input position.

        Args:
            input_ids: Integer token IDs
            attn_mask: Attention mask passed to the base model

        Returns:
            Tensor whose last dimension is vocab_size
        """
        hidden_states = self.base_model(input_ids, attn_mask)
        return self.lm_head(hidden_states)

 

 

def create_causal_mask(seq_len: int, device: torch.device, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a causal mask for self-attention.

    Entries strictly above the main diagonal are ``-inf``; the diagonal
    and everything below it are 0.

    Args:
        seq_len: Length of the sequence
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Causal mask of shape (seq_len, seq_len)
    """
    # triu(diagonal=1) keeps the strict upper triangle and zeroes the rest
    mask = torch.full(
        (seq_len, seq_len), float("-inf"), device=device, dtype=dtype
    ).triu(diagonal=1)
    return mask

 

def create_padding_mask(
    batch: Tensor,
    padding_token_id: int,
    device: torch.device,
    dtype: torch.dtype = torch.float32,
) -> Tensor:
    """Create a padding mask for a batch of sequences for self-attention.

    Entry (b, 0, i, j) is ``-inf`` when token i or token j of sequence b
    is the padding token, and 0 otherwise.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        padding_token_id: ID of the padding token
        device: Device to create the mask on
        dtype: Data type of the mask

    Returns:
        Padding mask of shape (batch_size, 1, seq_len, seq_len)
    """
    # per-token vector: 0 for real tokens, -inf for padding tokens
    padded = torch.zeros_like(batch, device=device, dtype=dtype).masked_fill(
        batch == padding_token_id, float("-inf")
    )
    # broadcast-add the row and column views so either side being padding masks the pair
    # NOTE(review): rows for padded query positions are entirely -inf — confirm
    # that downstream attention/loss code tolerates or ignores those positions
    mask = padded[:, :, None] + padded[:, None, :]
    return mask[:, None, :, :]

 

 

# Create model with default config
model_config = LlamaConfig()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = LlamaForPretraining(model_config).to(device)
# print the model size (element counts, reported in units of 1024**2)
print(f"Model parameters size: {sum(p.numel() for p in model.parameters()) / 1024**2:.2f} M")
print(f"Model buffers size: {sum(p.numel() for p in model.buffers()) / 1024**2:.2f} M")

# Create a random tensor; IDs start at 1 so that 0 is reserved for padding
PAD_TOKEN_ID = 0
bs, seq_len = 5, 13
x = torch.randint(1, model_config.vocab_size, (bs, seq_len), dtype=torch.int32, device=device)
# set a different length of padding tokens at the end of each sequence
for i, pad_length in enumerate([4, 1, 0, 3, 8]):
    if pad_length > 0:  # x[i, -0:] would select the whole row, so skip zero
        x[i, -pad_length:] = PAD_TOKEN_ID
# Create causal and padding masks
causal_mask = create_causal_mask(seq_len, device)
padding_mask = create_padding_mask(x, PAD_TOKEN_ID, device)
# (seq_len, seq_len) + (bs, 1, seq_len, seq_len) broadcasts to a per-sequence mask
attn_mask = causal_mask + padding_mask
print(f"Input ids: {x}")
print(f"Attention mask: {attn_mask}")

# Run the model
output = model(x, attn_mask)
print("OK")

Tags: Creating, GPT, Llama, model, NextToken, prediction
Admin

Admin

Next Post
3 Hidden Costco Gems You've Been Missing Out On

3 Hidden Costco Gems You've Been Missing Out On

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

Recommended.

PictoPop Video Generator Review: I Tested It for a Month

PictoPop Video Generator Review: I Tested It for a Month

December 20, 2025
Amazon Liquidates Bose Headphones at 50% Off, Now Cheaper Than Mid-Range No-Name Models

Amazon Liquidates Bose Headphones at 50% Off, Now Cheaper Than Mid-Range No-Name Models

November 18, 2025

Trending.

How to Solve the Wall Puzzle in Where Winds Meet

How to Solve the Wall Puzzle in Where Winds Meet

November 16, 2025
Researchers Discover Critical GitHub CVE-2026-3854 RCE Flaw Exploitable via Single Git Push

Researchers Discover Critical GitHub CVE-2026-3854 RCE Flaw Exploitable via Single Git Push

April 29, 2026
Google Introduces Simula: A Reasoning-First Framework for Generating Controllable, Scalable Synthetic Datasets Across Specialized AI Domains

Google Introduces Simula: A Reasoning-First Framework for Generating Controllable, Scalable Synthetic Datasets Across Specialized AI Domains

April 21, 2026
Google DeepMind Introduces Decoupled DiLoCo: An Asynchronous Training Architecture Achieving 88% Goodput Under High Hardware Failure Rates

Google DeepMind Introduces Decoupled DiLoCo: An Asynchronous Training Architecture Achieving 88% Goodput Under High Hardware Failure Rates

April 24, 2026
5 AI Compute Architectures Every Engineer Should Know: CPUs, GPUs, TPUs, NPUs, and LPUs Compared

5 AI Compute Architectures Every Engineer Should Know: CPUs, GPUs, TPUs, NPUs, and LPUs Compared

April 10, 2026

AimactGrow

Welcome to AimactGrow, your ultimate source for all things technology! Our mission is to provide insightful, up-to-date content on the latest advancements in technology, coding, gaming, digital marketing, SEO, cybersecurity, and artificial intelligence (AI).

Categories

  • AI
  • Coding
  • Cybersecurity
  • Digital marketing
  • Gaming
  • SEO
  • Technology

Recent News

A profile of OpenAI CFO Sarah Friar, who sources say helped keep OpenAI’s Microsoft deal on track and has privately suggested waiting until 2027 for an IPO (Wall Street Journal)

A profile of OpenAI CFO Sarah Friar, who sources say helped keep OpenAI’s Microsoft deal on track and has privately suggested waiting until 2027 for an IPO (Wall Street Journal)

May 2, 2026
Massive Facebook Phishing Operation Leverages AppSheet, Netlify, and Telegram

Massive Facebook Phishing Operation Leverages AppSheet, Netlify, and Telegram

May 2, 2026
  • About Us
  • Privacy Policy
  • Disclaimer
  • Contact Us

© 2025 https://blog.aimactgrow.com/ - All Rights Reserved

No Result
View All Result
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing

© 2025 https://blog.aimactgrow.com/ - All Rights Reserved