# ---------------------------------------------------------------------------
# Section 4: variable-length ("packed") batch.
#
# All sequences are concatenated along the token axis of a single batch row
# (batch size 1) and a block-diagonal attention bias built from `seqlens`
# tells the kernel where each sequence starts and ends.
#
# Names used here but defined outside this section: `torch`, `xops`, `ab`,
# `vanilla_attention` and `gadget`.
# NOTE(review): `gadget` is presumably the torch device chosen earlier in the
# file, and `ab` presumably xformers' attn_bias module -- confirm.
# ---------------------------------------------------------------------------
print("\n" + "=" * 70 + "\n4. Variable-length packed batch — no padding waste\n" + "=" * 70)
seqlens = [37, 120, 8, 200]
whole = sum(seqlens)  # total number of tokens across all packed sequences
H, Okay = 8, 64  # number of heads, per-head feature size
q = torch.randn(1, whole, H, Okay, device=gadget, dtype=torch.float16)
ok = torch.randn(1, whole, H, Okay, device=gadget, dtype=torch.float16)  # keys
v = torch.randn(1, whole, H, Okay, device=gadget, dtype=torch.float16)
try:
    bias = ab.BlockDiagonalMask.from_seqlens(seqlens)
    out_packed = xops.memory_efficient_attention(q, ok, v, attn_bias=bias)

    # Sanity check: the first packed segment is compared against the
    # reference implementation run on that segment alone.
    s0 = seqlens[0]
    ref0 = vanilla_attention(q[:, :s0], ok[:, :s0], v[:, :s0]).half()
    print("packed form :", tuple(out_packed.shape), "(all", whole, "tokens, no pad)")
    print("segment-0 max diff : {:.2e}".format((out_packed[:, :s0] - ref0).abs().max().item()))

    # Same packed layout with the causal block-diagonal bias; the output is
    # discarded, the call only demonstrates that the path runs.
    cbias = ab.BlockDiagonalCausalMask.from_seqlens(seqlens)
    _ = xops.memory_efficient_attention(q, ok, v, attn_bias=cbias)
    print("-> additionally did a packed CAUSAL move. That is how vLLM-style engines")
    print(" batch requests of various lengths with zero padding overhead.")

    # Cut the packed output back into one tensor per original sequence.
    splits = bias.split(out_packed)
    print("recovered segments :", [tuple(t.shape) for t in splits])
except Exception as e:
    # Deliberate best-effort: the demo keeps going when this path fails,
    # reporting the error instead of aborting the script.
    print("BlockDiagonalMask path skipped on this model/backend:", repr(e))
# ---------------------------------------------------------------------------
# Section 5: grouped-query attention using 5-D inputs laid out as
# [B, M, G, Hq, K]: batch, tokens, KV-head groups, query heads per group,
# per-head feature size.
#
# Names used here but defined outside this section: `torch`, `xops` and
# `gadget`.
# NOTE(review): `gadget` is presumably the torch device chosen earlier in the
# file -- confirm.
# ---------------------------------------------------------------------------
print("\n" + "=" * 70 + "\n5. Grouped-query consideration (5-D BMGHK format)\n" + "=" * 70)
B, M, Okay = 2, 256, 64  # batch, tokens, per-head feature size
n_q_heads, n_kv_heads = 8, 2
G, Hq = n_kv_heads, n_q_heads // n_kv_heads  # groups, query heads per group
try:
    qg = torch.randn(B, M, G, Hq, Okay, device=gadget, dtype=torch.float16)
    # One real K/V head per group. xFormers does not broadcast the head axis
    # automatically, so the singleton axis is expanded to Hq explicitly;
    # `expand` returns a view and does not copy the data.
    kg = torch.randn(B, M, G, 1, Okay, device=gadget, dtype=torch.float16).expand(-1, -1, -1, Hq, -1)
    vg = torch.randn(B, M, G, 1, Okay, device=gadget, dtype=torch.float16).expand(-1, -1, -1, Hq, -1)
    out_gqa = xops.memory_efficient_attention(qg, kg, vg)
    print("GQA output form :", tuple(out_gqa.shape), "= [B, M, G, Hq, K]")
    print(f"-> {n_q_heads} question heads, solely {n_kv_heads} KV heads: smaller KV-cache,")
    print(" which is precisely what Llama-/Mistral-class fashions use at inference.")
except Exception as e:
    # Deliberate best-effort: the demo keeps going when this path fails,
    # reporting the error instead of aborting the script.
    print("GQA 5-D path skipped on this model/backend:", repr(e))
