Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

# Section D banner; the leading "\n" separates it from the previous section's output.
print("\n### SECTION D: end-to-end Transformer (vanilla fp32 vs Apex fused + AMP) ###")
# Toy benchmark hyperparameters:
#   VOCAB  - vocabulary size (embedding rows / output logits)
#   D      - model (embedding) width
#   NHEAD  - attention heads per block
#   LAYERS - number of Transformer blocks
#   SEQ    - tokens per training sequence
#   BATCH  - sequences per batch
#   STEPS  - number of timed optimizer steps
VOCAB, D, NHEAD, LAYERS, SEQ, BATCH, STEPS = 2000, 256, 4, 4, 128, 32, 60
class Block(torch.nn.Module):
    """Pre-norm Transformer block: self-attention and feed-forward, each with a residual add.

    Args:
        d: Feature width of the input and output.
        nhead: Number of attention heads passed to ``MultiheadAttention``.
        norm_cls: Callable that builds a normalization layer from the width
            ``d`` (for example ``torch.nn.LayerNorm``).
    """

    def __init__(self, d, nhead, norm_cls):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(d, nhead, batch_first=True)
        # Position-wise MLP with a 4x hidden expansion.
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(d, 4 * d),
            torch.nn.GELU(),
            torch.nn.Linear(4 * d, d),
        )
        self.n1, self.n2 = norm_cls(d), norm_cls(d)

    def forward(self, x):
        """Apply normalized self-attention then the normalized MLP, adding residuals."""
        h = self.n1(x)
        # MultiheadAttention returns (output, weights); only the output is used.
        x = x + self.attn(h, h, h, need_weights=False)[0]
        return x + self.ff(self.n2(x))
class TinyTransformer(torch.nn.Module):
    """Small Transformer stack: embedding, ``LAYERS`` blocks, final norm, vocab projection.

    Sizes come from the module-level constants ``VOCAB``, ``D``, ``NHEAD`` and
    ``LAYERS``.

    Args:
        norm_cls: Callable that builds a normalization layer from a width; it is
            used for every block and for the final norm.
    """

    def __init__(self, norm_cls):
        super().__init__()
        self.emb = torch.nn.Embedding(VOCAB, D)
        self.blocks = torch.nn.ModuleList(
            [Block(D, NHEAD, norm_cls) for _ in range(LAYERS)]
        )
        self.norm = norm_cls(D)
        self.head = torch.nn.Linear(D, VOCAB)

    def forward(self, idx):
        """Embed token indices, run them through every block, and return vocabulary logits."""
        x = self.emb(idx)
        for b in self.blocks:
            x = b(x)
        return self.head(self.norm(x))
# Seeded CPU generator so both benchmark runs see the same random token batch.
g = torch.Generator(device="cpu").manual_seed(0)
# One extra token per row so inputs and targets can be offset by one position.
information = torch.randint(0, VOCAB, (BATCH, SEQ + 1), generator=g).to(DEV)
# Next-token setup: inputs drop the last token, targets drop the first.
inp, tgt = information[:, :-1], information[:, 1:]
lossfn = torch.nn.CrossEntropyLoss()
def run_training(use_apex):
    """Train a fresh ``TinyTransformer`` on the fixed batch and time ``STEPS`` steps.

    Args:
        use_apex: When true, enables fp16 autocast and gradient scaling, and uses
            ``FusedLayerNorm`` / ``FusedAdam`` where the corresponding
            availability flags (``HAS_FLN``, ``HAS_AMP_C``, ``APEX_OK``) are set.
            When false, runs without autocast using ``torch.nn.LayerNorm`` and
            ``torch.optim.AdamW``.

    Returns:
        Tuple ``(final_loss, tokens_per_second, elapsed_seconds)`` for the timed
        steps; the warm-up steps are excluded from the timing.
    """
    torch.manual_seed(0)  # identical parameter initialization for every run
    norm_cls = (FusedLayerNorm if (use_apex and HAS_FLN and APEX_OK) else torch.nn.LayerNorm)
    model = TinyTransformer(norm_cls).to(DEV)
    if use_apex and HAS_AMP_C and APEX_OK:
        optimizer = FusedAdam(model.parameters(), lr=3e-4)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    # With enabled=False the scaler's scale/step/update calls are pass-throughs,
    # so the same step code serves both configurations.
    scaler = torch.amp.GradScaler("cuda", enabled=use_apex)

    def one_step():
        """Run one forward/backward/optimizer step and return the loss tensor."""
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_apex):
            logits = model(inp)
            loss = lossfn(logits.reshape(-1, VOCAB), tgt.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        return loss

    # Warm-up steps, not timed.
    for _ in range(5):
        one_step()
    # Drain queued GPU work so the timer measures only the steps below.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(STEPS):
        loss = one_step()
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return loss.item(), (STEPS * BATCH * SEQ) / dt, dt
# Baseline run: no autocast, stock LayerNorm and AdamW.
loss_v, tps_v, dt_v = run_training(use_apex=False)
print(f"  vanilla (fp32, nn.LayerNorm, AdamW)        : "
      f"{dt_v:5.2f}s  | {tps_v:9.0f} tok/s | closing loss {loss_v:.3f}")
# Only run the Apex path when at least one fused component is actually available.
if APEX_OK and (HAS_AMP_C or HAS_FLN):
    loss_a, tps_a, dt_a = run_training(use_apex=True)
    print(f"  apex   (fp16, FusedLayerNorm, FusedAdam)   : "
          f"{dt_a:5.2f}s  | {tps_a:9.0f} tok/s | closing loss {loss_a:.3f}")
    print(f"  ----> speedup: {tps_a / tps_v:0.2f}x throughput")
else:
    print("  apex path SKIPPED (no fused kernels constructed)")
# Closing summary banner.
print("\n" + "=" * 78)
print("DONE. Key takeaways:")
print("  - FusedAdam/FusedLayerNorm/FusedRMSNorm are the still-relevant Apex items;")
print("    speedups grow with model size & parameter count (tiny demo understates it).")
print("  - apex.amp is deprecated -> prefer torch.amp.autocast + torch.amp.GradScaler.")
print("  - FusedAdam composes cleanly with native torch.amp (Section D).")
print("  - On real workloads, also try a larger model and bf16 autocast (no scaler needed).")
print("=" * 78)

Source link

What's Hot

A 7-Precept Framework to Construct, Run, and Evolve with AI

Why Autonomous Brokers Wrestle with Single-Mannequin Pipelines and How AI.cc Gives the Resolution

AI search visibility report finds 53% of B2B corporations absent from ChatGPT and AI Search

Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

How to Build Memory-Efficient Transformers with xFormers Using Packed Sequences, GQA, ALiBi, SwiGLU, and Causal Attention

A Coding Implementation on MONAI for End-to-End 3D Spleen Segmentation Using UNet on Medical CT Volumes

Nous Research Proposes Lighthouse Attention: A Training-Only Selection-Based Hierarchical Attention That Delivers 1.4–1.7× Pretraining Speedup at Long Context

A 7-Precept Framework to Construct, Run, and Evolve with AI

Why Autonomous Brokers Wrestle with Single-Mannequin Pipelines and How AI.cc Gives the Resolution

AI search visibility report finds 53% of B2B corporations absent from ChatGPT and AI Search

GAPVelocity AI Unveils VELO for PowerBuilder, Bringing Agentic AI Modernization to Enterprise Legacy Programs

A 7-Precept Framework to Construct, Run, and Evolve with AI

Why Autonomous Brokers Wrestle with Single-Mannequin Pipelines and How AI.cc Gives the Resolution

AI search visibility report finds 53% of B2B corporations absent from ChatGPT and AI Search

GAPVelocity AI Unveils VELO for PowerBuilder, Bringing Agentic AI Modernization to Enterprise Legacy Programs

Our Picks

A 7-Precept Framework to Construct, Run, and Evolve with AI

Why Autonomous Brokers Wrestle with Single-Mannequin Pipelines and How AI.cc Gives the Resolution

AI search visibility report finds 53% of B2B corporations absent from ChatGPT and AI Search

Trending

GAPVelocity AI Unveils VELO for PowerBuilder, Bringing Agentic AI Modernization to Enterprise Legacy Programs

Money Cow Advertising Releases New Steerage on Getting ready for the AI-Powered Way forward for Search

For Multi-Location Manufacturers, 20% of Their Areas are Invisible on AI Search

Subscribe to Updates

What's Hot

Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

Related Posts