• About Us
  • Privacy Policy
  • Disclaimer
  • Contact Us
AimactGrow
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing
No Result
View All Result
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing
No Result
View All Result
AimactGrow
No Result
View All Result

Train Your Large Model on Multiple GPUs with Fully Sharded Data Parallelism

Admin by Admin
January 2, 2026
Home AI
Share on FacebookShare on Twitter


import dataclasses
import functools
import os

import datasets
import tokenizers
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import tqdm
from torch import Tensor
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    apply_activation_checkpointing,
    checkpoint_wrapper,
)
from torch.distributed.checkpoint import load, save
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    get_state_dict,
    set_state_dict,
)
from torch.distributed.fsdp import (
    CPUOffloadPolicy,
    FSDPModule,
    MixedPrecisionPolicy,
    fully_shard,
)
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from torch.utils.data.distributed import DistributedSampler

# Build the model
@dataclasses.dataclass
class LlamaConfig:
    """Define Llama model hyperparameters."""

    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA

class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding (RoPE).

    The cosine and sine tables are computed once in ``__init__`` and stored
    as buffers named ``cos`` and ``sin``, each of shape
    ``(max_position_embeddings, dim)``.
    """

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        # compute a matrix of n * theta_i
        N = 10_000.0
        inv_freq = 1.0 / (N ** (torch.arange(0, dim, 2) / dim))
        # duplicate the frequencies so the table matches the two halves
        # (x1, x2) that forward() rotates against each other
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        position = torch.arange(max_position_embeddings)
        sinusoid_inp = torch.outer(position, inv_freq)
        # save cosine and sine matrices as buffers, not parameters
        self.register_buffer("cos", sinusoid_inp.cos())
        self.register_buffer("sin", sinusoid_inp.sin())

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim)

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        _, seq_len, _, _ = x.shape
        device = x.device
        dtype = x.dtype
        # slice first so only the rows actually used are moved/cast, then
        # reshape to 4D so the tables broadcast over batch and head axes
        cos = self.cos[:seq_len].to(device, dtype).view(1, seq_len, 1, -1)
        sin = self.sin[:seq_len].to(device, dtype).view(1, seq_len, 1, -1)
        # apply RoPE to x
        x1, x2 = x.chunk(2, dim=-1)
        rotated = torch.cat((-x2, x1), dim=-1)
        return (x * cos) + (rotated * sin)

class LlamaAttention(nn.Module):
    """Grouped-query attention with rotary embeddings."""

    def __init__(self, config: LlamaConfig) -> None:
        """Create the Q/K/V/O projections from ``config``.

        Reads ``hidden_size``, ``num_attention_heads`` and
        ``num_key_value_heads`` from the config.
        """
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_kv_heads = config.num_key_value_heads  # GQA: H_kv < H_q

        # hidden_size must be divisible by num_heads
        assert (self.head_dim * self.num_heads) == self.hidden_size

        # Linear layers for Q, K, V projections
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def reset_parameters(self):
        """Re-initialize all four projection layers."""
        self.q_proj.reset_parameters()
        self.k_proj.reset_parameters()
        self.v_proj.reset_parameters()
        self.o_proj.reset_parameters()

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Run self-attention on ``hidden_states``.

        Args:
            hidden_states: Input tensor of shape (batch_size, seq_len, hidden_size)
            rope: Module applied to the query and key tensors
            attn_mask: Mask forwarded to ``scaled_dot_product_attention``

        Returns:
            Tensor of shape (batch_size, seq_len, hidden_size)
        """
        bs, seq_len, _ = hidden_states.size()

        # Project inputs to Q, K, V
        query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

        # Apply rotary position embeddings
        query_states = rope(query_states)
        key_states = rope(key_states)

        # Transpose tensors from BSHD to BHSD dimension for scaled_dot_product_attention
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Use PyTorch's optimized attention implementation
        # setting is_causal=True is incompatible with setting explicit attention mask
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_mask,
            dropout_p=0.0,
            enable_gqa=True,
        )

        # Transpose output tensor from BHSD to BSHD dimension, reshape to 3D, and then project output
        attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)
        attn_output = self.o_proj(attn_output)
        return attn_output

class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: LlamaConfig) -> None:
        """Create the gate/up/down projections from ``config``.

        Reads ``hidden_size`` and ``intermediate_size`` from the config.
        """
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SwiGLU activation function
        # Project back to hidden size
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def reset_parameters(self):
        """Re-initialize the three projection layers."""
        self.gate_proj.reset_parameters()
        self.up_proj.reset_parameters()
        self.down_proj.reset_parameters()

    def forward(self, x: Tensor) -> Tensor:
        """Return ``down_proj(silu(gate_proj(x)) * up_proj(x))``."""
        # SwiGLU activation: multiply gate and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)

class LlamaDecoderLayer(nn.Module):
    """Single transformer layer for a Llama model."""

    def __init__(self, config: LlamaConfig) -> None:
        """Create the two RMSNorm layers, the attention block and the MLP."""
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def reset_parameters(self):
        """Re-initialize every sub-module of this layer."""
        self.input_layernorm.reset_parameters()
        self.self_attn.reset_parameters()
        self.post_attention_layernorm.reset_parameters()
        self.mlp.reset_parameters()

    def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding, attn_mask: Tensor) -> Tensor:
        """Apply pre-norm self-attention and pre-norm MLP, each with a residual.

        Args:
            hidden_states: Input to the layer
            rope: Rotary encoding module forwarded to the attention block
            attn_mask: Attention mask forwarded to the attention block

        Returns:
            The layer output, same shape as ``hidden_states``
        """
        # First residual block: Self-attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope, attn_mask=attn_mask)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states

class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads."""

    def __init__(self, config: LlamaConfig) -> None:
        """Create the rotary encoding, token embedding, decoder stack and final norm."""
        super().__init__()
        self.rotary_emb = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([
            LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)
        ])
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def reset_parameters(self):
        """Re-initialize all parameters and recompute the RoPE tables.

        This is the hook used after the model has been built on the meta
        device and materialized with ``to_empty()``: at that point neither
        parameters nor buffers hold meaningful values.
        """
        self.embed_tokens.reset_parameters()
        for layer in self.layers:
            layer.reset_parameters()
        self.norm.reset_parameters()
        # The RoPE cos/sin tables are buffers, so none of the calls above
        # touch them. Rebuild them on CPU (explicitly, in case a meta-device
        # context is active) and copy them into the existing buffers.
        rope = self.rotary_emb
        with torch.device("cpu"):
            fresh = RotaryPositionEncoding(rope.dim, rope.max_position_embeddings)
        with torch.no_grad():
            rope.cos.copy_(fresh.cos)
            rope.sin.copy_(fresh.sin)

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Embed ``input_ids`` and run them through every decoder layer.

        Args:
            input_ids: Token IDs to embed
            attn_mask: Attention mask forwarded to every decoder layer

        Returns:
            The final hidden states after the last norm layer
        """
        # Convert input token IDs to embeddings
        hidden_states = self.embed_tokens(input_ids)
        # Process through all transformer layers, then the final norm layer
        for layer in self.layers:
            hidden_states = layer(hidden_states, rope=self.rotary_emb, attn_mask=attn_mask)
        hidden_states = self.norm(hidden_states)
        # Return the final hidden states
        return hidden_states

class LlamaForPretraining(nn.Module):
    """Llama base model followed by a linear language-modeling head."""

    def __init__(self, config: LlamaConfig) -> None:
        """Create the base model and the bias-free output projection."""
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def reset_parameters(self):
        """Re-initialize the base model and the LM head."""
        self.base_model.reset_parameters()
        self.lm_head.reset_parameters()

    def forward(self, input_ids: Tensor, attn_mask: Tensor) -> Tensor:
        """Return the LM-head output for ``input_ids`` under ``attn_mask``."""
        hidden_states = self.base_model(input_ids, attn_mask)
        return self.lm_head(hidden_states)

def create_causal_mask(batch: Tensor, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a causal mask for self-attention.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        dtype: Data type of the mask

    Returns:
        Causal mask of shape (seq_len, seq_len): ``-inf`` strictly above the
        diagonal and ``0`` elsewhere, on the same device as ``batch``
    """
    _, seq_len = batch.shape
    mask = torch.full(
        (seq_len, seq_len), float("-inf"), device=batch.device, dtype=dtype
    ).triu(diagonal=1)
    return mask

def create_padding_mask(batch: Tensor, padding_token_id: int, dtype: torch.dtype = torch.float32) -> Tensor:
    """Create a padding mask for a batch of sequences for self-attention.

    Args:
        batch: Batch of sequences, shape (batch_size, seq_len)
        padding_token_id: ID of the padding token
        dtype: Data type of the mask

    Returns:
        Padding mask of shape (batch_size, 1, seq_len, seq_len). Entry
        ``[b, 0, i, j]`` is ``-inf`` when position ``i`` or ``j`` of sequence
        ``b`` is a padding token, and ``0`` otherwise.
    """
    padded = torch.zeros_like(batch, device=batch.device, dtype=dtype).masked_fill(
        batch == padding_token_id, float("-inf")
    )
    # outer sum: a pair (i, j) is masked if either side is padding
    mask = padded[:, :, None] + padded[:, None, :]
    return mask[:, None, :, :]

# Dataset that creates padded sequences of fixed length
class PretrainingDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding fixed-length (input, target) token tensors."""

    def __init__(self, dataset: datasets.Dataset, tokenizer: tokenizers.Tokenizer,
                 seq_length: int):
        """Store the text dataset and tokenizer and look up special token IDs.

        Args:
            dataset: Indexable collection whose items have a ``"text"`` field
            tokenizer: Tokenizer providing ``token_to_id`` and ``encode``
            seq_length: Length of each returned input/target sequence
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.bot = tokenizer.token_to_id("[BOT]")
        self.eot = tokenizer.token_to_id("[EOT]")
        self.pad = tokenizer.token_to_id("[PAD]")

    def __len__(self):
        """Return the number of items in the underlying dataset."""
        return len(self.dataset)

    def __getitem__(self, index: int) -> tuple[Tensor, Tensor]:
        """Get a sequence of token ids from the dataset. [BOT] and [EOT] tokens
        are added. Clipped and padded to the sequence length.

        Returns:
            ``(x, y)`` int64 tensors of length ``seq_length``; ``y`` is ``x``
            shifted left by one token (the next-token targets).
        """
        seq = self.dataset[index]["text"]
        tokens: list[int] = [self.bot] + self.tokenizer.encode(seq).ids + [self.eot]
        # pad to target sequence length (+1 so the shifted target is full length)
        toklen = len(tokens)
        if toklen < self.seq_length + 1:
            pad_length = self.seq_length + 1 - toklen
            tokens += [self.pad] * pad_length
        # return the sequence
        x = torch.tensor(tokens[:self.seq_length], dtype=torch.int64)
        y = torch.tensor(tokens[1:self.seq_length + 1], dtype=torch.int64)
        return x, y

def load_checkpoint(model: nn.Module, optimizer: torch.optim.Optimizer, scheduler: lr_scheduler.SequentialLR) -> None:
    """Restore model, optimizer and scheduler state from ``checkpoint-dist``.

    Reads the module-level globals ``cpu_offload`` and ``device``. The
    function starts and ends with ``dist.barrier()``, so every rank must
    call it.

    Args:
        model: Model whose state is overwritten in place
        optimizer: Optimizer whose state is overwritten in place
        scheduler: LR scheduler restored from ``checkpoint-dist/lrscheduler.pt``
    """
    dist.barrier()
    # get the current state dicts to use as the template that load() fills in
    model_state, optimizer_state = get_state_dict(
        model, optimizer, options=StateDictOptions(full_state_dict=True, cpu_offload=cpu_offload),
    )
    load(
        {"model": model_state, "optimizer": optimizer_state},
        checkpoint_id="checkpoint-dist",
    )
    set_state_dict(
        model, optimizer,
        model_state_dict=model_state, optim_state_dict=optimizer_state,
        options=StateDictOptions(broadcast_from_rank0=True, full_state_dict=True, cpu_offload=cpu_offload),
    )
    scheduler.load_state_dict(
        torch.load("checkpoint-dist/lrscheduler.pt", map_location=device),
    )
    dist.barrier()

def save_checkpoint(model: nn.Module, optimizer: torch.optim.Optimizer, scheduler: lr_scheduler.SequentialLR) -> None:
    """Write model, optimizer and scheduler state to ``checkpoint-dist``.

    Reads the module-level global ``cpu_offload``. The function starts and
    ends with ``dist.barrier()``, so every rank must call it. Only rank 0
    writes the scheduler file.

    Args:
        model: Model whose state is saved
        optimizer: Optimizer whose state is saved
        scheduler: LR scheduler saved to ``checkpoint-dist/lrscheduler.pt``
    """
    dist.barrier()
    model_state, optimizer_state = get_state_dict(
        model, optimizer, options=StateDictOptions(full_state_dict=True, cpu_offload=cpu_offload),
    )
    save(
        {"model": model_state, "optimizer": optimizer_state},
        checkpoint_id="checkpoint-dist",
    )
    if dist.get_rank() == 0:
        torch.save(scheduler.state_dict(), "checkpoint-dist/lrscheduler.pt")
    dist.barrier()

# Load the tokenizer and dataset
tokenizer = tokenizers.Tokenizer.from_file("bpe_50K.json")
dataset = datasets.load_dataset("HuggingFaceFW/fineweb", "sample-10BT", split="train")

# Initialize the distributed environment
dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
device = torch.device(f"cuda:{local_rank}")
# Bind this process to its own GPU so collectives (e.g. dist.barrier) do not
# have to guess which device to use
torch.cuda.set_device(device)
rank = dist.get_rank()
world_size = dist.get_world_size()
print(f"World size {world_size}, rank {rank}, local rank {local_rank}. Using {device}")

# Create pretraining model on meta device, on all ranks
with torch.device("meta"):
    model_config = LlamaConfig()
    model = LlamaForPretraining(model_config)

# Convert model from meta device to FSDP2, must shard every component
cpu_offload = False
fsdp_kwargs = {
    # optional: use mixed precision training
    "mp_policy": MixedPrecisionPolicy(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.float32,
    ),
    # optional: CPU offloading
    "offload_policy": CPUOffloadPolicy() if cpu_offload else None,
    # optional: discard all-gathered parameters after forward pass even on root modules
    # "reshard_after_forward": True,
}
for layer in model.base_model.layers:
    fully_shard(layer, **fsdp_kwargs)
fully_shard(model.base_model, **fsdp_kwargs)
fully_shard(model, **fsdp_kwargs)
# Materialize the (still uninitialized) storage, then initialize the weights
model.to_empty(device="cpu" if cpu_offload else device)
model.reset_parameters()
assert isinstance(model, FSDPModule), f"Expected FSDPModule, got {type(model)}"

# Set explicit prefetching on models
# more prefetching uses more memory, but allow more overlap of computation and communication
num_prefetch = 1
if num_prefetch > 1:
    modules = list(model.base_model.layers)
    for i, module in enumerate(modules):
        if i == len(modules) - 1:
            break
        module.set_modules_to_forward_prefetch(modules[i+1:i+num_prefetch+1])
    for i, module in enumerate(modules):
        if i == 0:
            continue
        module.set_modules_to_backward_prefetch(modules[max(0, i-num_prefetch):i])

# Optional: Apply gradient checkpointing on a distributed model (all ranks)
#wrap_policy = functools.partial(
#    transformer_auto_wrap_policy,
#    transformer_layer_cls={LlamaDecoderLayer, nn.Embedding},
#)
#apply_activation_checkpointing(
#    model,
#    checkpoint_wrapper_fn=checkpoint_wrapper,
#    auto_wrap_policy=wrap_policy,
#)

# Training parameters
epochs = 3
learning_rate = 1e-3
batch_size = 64 // world_size
seq_length = 512
num_warmup_steps = 1000
PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")
model.train()

# DataLoader, optimizer, scheduler, and loss function
# Sampler is needed to shard the dataset across world size
dataset = PretrainingDataset(dataset, tokenizer, seq_length)
sampler = DistributedSampler(dataset, shuffle=False, drop_last=True)
dataloader = torch.utils.data.DataLoader(
    dataset,
    sampler=sampler,
    batch_size=batch_size,
    pin_memory=True,  # optional
    shuffle=False,
    num_workers=2,
    prefetch_factor=2,
)
num_training_steps = len(dataloader) * epochs

optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.1,
)
warmup_scheduler = lr_scheduler.LinearLR(
    optimizer,
    start_factor=0.1, end_factor=1.0, total_iters=num_warmup_steps,
)
cosine_scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=num_training_steps - num_warmup_steps,
    eta_min=0,
)
scheduler = lr_scheduler.SequentialLR(
    optimizer,
    schedulers=[warmup_scheduler, cosine_scheduler],
    milestones=[num_warmup_steps],
)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)

# Optional: Compile the model and loss function
#model = torch.compile(model)
#loss_fn = torch.compile(loss_fn)

# if checkpoint-dist dir exists, load the checkpoint to model and optimizer
if os.path.exists("checkpoint-dist"):
    load_checkpoint(model, optimizer, scheduler)

# start training
for epoch in range(epochs):
    pbar = tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch_id, batch in enumerate(pbar):
        if batch_id % 1000 == 0:
            save_checkpoint(model, optimizer, scheduler)
        # Explicit prefetching before sending any data to model
        model.unshard()
        # Get batched data, move from CPU to GPU
        input_ids, target_ids = batch
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)
        # create attention mask: causal mask + padding mask
        attn_mask = (
            create_causal_mask(input_ids)
            + create_padding_mask(input_ids, PAD_TOKEN_ID)
        )
        # Extract output from model
        logits = model(input_ids, attn_mask)
        # Compute loss: cross-entropy between logits and target, ignoring padding tokens
        loss = loss_fn(logits.view(-1, logits.size(-1)), target_ids.view(-1))
        # Backward with loss and gradient clipping by L2 norm to 1.0
        # Optimizer and gradient clipping works on DTensor
        optimizer.zero_grad(set_to_none=not cpu_offload)
        loss.backward()
        # All-reduce fails if using CPU offloading
        if not cpu_offload:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # No manual pbar.update(): iterating over the tqdm object already
        # advances the bar once per batch, so an extra update double-counts
        pbar.set_postfix(loss=loss.item())
    pbar.close()

# Save the model
save_checkpoint(model, optimizer, scheduler)

# Clean up the distributed environment
dist.destroy_process_group()

Tags: DataFullyGPUsLargemodelmultipleParallelismShardedTrain
Admin

Admin

Next Post
Co-op RPGs as Good as Baldur’s Gate 3 That Supply Completely Completely different Experiences

Co-op RPGs as Good as Baldur's Gate 3 That Supply Completely Completely different Experiences

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

Recommended.

Tech Life – What to anticipate from tech in 2026

Tech Life – What to anticipate from tech in 2026

January 30, 2026
Right here’s what to look out for

Right here’s what to look out for

May 7, 2025

Trending.

The right way to Defeat Imagawa Tomeji

The right way to Defeat Imagawa Tomeji

September 28, 2025
Introducing Sophos Endpoint for Legacy Platforms – Sophos Information

Introducing Sophos Endpoint for Legacy Platforms – Sophos Information

August 28, 2025
How Voice-Enabled NSFW AI Video Turbines Are Altering Roleplay Endlessly

How Voice-Enabled NSFW AI Video Turbines Are Altering Roleplay Endlessly

June 10, 2025
AI-Assisted Menace Actor Compromises 600+ FortiGate Gadgets in 55 Nations

AI-Assisted Menace Actor Compromises 600+ FortiGate Gadgets in 55 Nations

February 23, 2026
Rogue Planet’ in Growth for Launch on iOS, Android, Change, and Steam in 2025 – TouchArcade

Rogue Planet’ in Growth for Launch on iOS, Android, Change, and Steam in 2025 – TouchArcade

June 19, 2025

AimactGrow

Welcome to AimactGrow, your ultimate source for all things technology! Our mission is to provide insightful, up-to-date content on the latest advancements in technology, coding, gaming, digital marketing, SEO, cybersecurity, and artificial intelligence (AI).

Categories

  • AI
  • Coding
  • Cybersecurity
  • Digital marketing
  • Gaming
  • SEO
  • Technology

Recent News

an AI agent for code safety — Google DeepMind

an AI agent for code safety — Google DeepMind

February 24, 2026
The Obtain: Radioactive rhinos, and the rise and rise of peptides

The Obtain: Radioactive rhinos, and the rise and rise of peptides

February 24, 2026
  • About Us
  • Privacy Policy
  • Disclaimer
  • Contact Us

© 2025 https://blog.aimactgrow.com/ - All Rights Reserved

No Result
View All Result
  • Home
  • Technology
  • AI
  • SEO
  • Coding
  • Gaming
  • Cybersecurity
  • Digital marketing

© 2025 https://blog.aimactgrow.com/ - All Rights Reserved