Deep Learning
A Coding Implementation on Microsoft’s Phi-4-Mini for Quantized Inference, Reasoning, Tool Use, RAG, and LoRA Fine-Tuning

By Editorial Team | April 21, 2026 | 2 Mins Read


import subprocess, sys, os, shutil, glob


def pip_install(args):
   # Quiet pip install via the current interpreter; raise on failure.
   subprocess.run([sys.executable, "-m", "pip", "install", "-q", *args],
                  check=True)


pip_install(["huggingface_hub>=0.26,<1.0"])


pip_install([
   "-U",
   "transformers>=4.49,<4.57",
   "accelerate>=0.33.0",
   "bitsandbytes>=0.43.0",
   "peft>=0.11.0",
   "datasets>=2.20.0,<3.0",
   "sentence-transformers>=3.0.0,<4.0",
   "faiss-cpu",
])


for p in glob.glob(os.path.expanduser(
       "~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4*")):
   shutil.rmtree(p, ignore_errors=True)


for _m in list(sys.modules):
   if _m.startswith(("transformers", "huggingface_hub", "tokenizers",
                     "accelerate", "peft", "datasets",
                     "sentence_transformers")):
       del sys.modules[_m]
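The purge loop above relies on simple name-prefix matching against `sys.modules`; a minimal sketch of the same logic against a stand-in module table (the entries here are illustrative, not a real interpreter state):

```python
# Stand-in for sys.modules: only the key names matter for the purge logic.
fake_modules = {
    "transformers": object(),
    "transformers.models.phi3": object(),
    "peft.tuners.lora": object(),
    "numpy": object(),
}

prefixes = ("transformers", "huggingface_hub", "tokenizers",
            "accelerate", "peft", "datasets", "sentence_transformers")

# Iterate over a snapshot (list(...)) so deleting keys mid-loop is safe.
for name in list(fake_modules):
    if name.startswith(prefixes):
        del fake_modules[name]

print(sorted(fake_modules))
```

Note that `str.startswith` accepts a tuple of prefixes, which is what lets the original loop cover all seven package families in one test.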


import json, re, textwrap, warnings, torch
warnings.filterwarnings("ignore")


from transformers import (
   AutoModelForCausalLM,
   AutoTokenizer,
   BitsAndBytesConfig,
   TextStreamer,
   TrainingArguments,
   Trainer,
   DataCollatorForLanguageModeling,
)
import transformers
print(f"Using transformers {transformers.__version__}")


PHI_MODEL_ID = "microsoft/Phi-4-mini-instruct"


assert torch.cuda.is_available(), (
   "No GPU detected. In Colab: Runtime > Change runtime type > T4 GPU."
)
print(f"GPU detected: {torch.cuda.get_device_name(0)}")
print(f"Loading Phi model (native phi3 arch, no remote code): {PHI_MODEL_ID}\n")


bnb_cfg = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16,
   bnb_4bit_use_double_quant=True,
)


phi_tokenizer = AutoTokenizer.from_pretrained(PHI_MODEL_ID)
if phi_tokenizer.pad_token_id is None:
   phi_tokenizer.pad_token = phi_tokenizer.eos_token


phi_model = AutoModelForCausalLM.from_pretrained(
   PHI_MODEL_ID,
   quantization_config=bnb_cfg,
   device_map="auto",
   torch_dtype=torch.bfloat16,
)
phi_model.config.use_cache = True


print(f"\n✓ Phi-4-mini loaded in 4-bit. "
     f"GPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print(f"  Architecture: {phi_model.config.model_type}   "
     f"(using built-in {type(phi_model).__name__})")
print(f"  Parameters: ~{sum(p.numel() for p in phi_model.parameters())/1e9:.2f}B")
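As a rough sanity check on the printed memory figure, NF4 stores weights at about 0.5 bytes per parameter, plus a small overhead for quantization constants. A back-of-the-envelope sketch (the parameter count and overhead factor are assumptions, not measured values):

```python
n_params = 3.8e9          # approximate Phi-4-mini parameter count (assumed)
bytes_per_param = 0.5     # 4-bit NF4 weight storage
overhead = 1.05           # rough allowance for quantization constants and buffers

est_gb = n_params * bytes_per_param * overhead / 1e9
print(f"Estimated 4-bit weight footprint: ~{est_gb:.1f} GB")
```

If the `torch.cuda.memory_allocated()` number printed above lands far from this estimate, something other than the quantized weights (KV cache, fragmentation, a second model copy) is likely occupying the GPU.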


def ask_phi(messages, *, tools=None, max_new_tokens=512,
           temperature=0.3, stream=False):
   """Single entry point for all Phi-4-mini inference calls below."""
   prompt_ids = phi_tokenizer.apply_chat_template(
       messages,
       tools=tools,
       add_generation_prompt=True,
       return_tensors="pt",
   ).to(phi_model.device)


   streamer = (TextStreamer(phi_tokenizer, skip_prompt=True,
                            skip_special_tokens=True)
               if stream else None)


   with torch.inference_mode():
       out = phi_model.generate(
           prompt_ids,
           max_new_tokens=max_new_tokens,
           do_sample=temperature > 0,
           temperature=max(temperature, 1e-5),
           top_p=0.9,
           pad_token_id=phi_tokenizer.pad_token_id,
           eos_token_id=phi_tokenizer.eos_token_id,
           streamer=streamer,
       )
   return phi_tokenizer.decode(
       out[0][prompt_ids.shape[1]:], skip_special_tokens=True
   ).strip()
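`ask_phi` expects chat-format messages and, optionally, a list of tool schemas for the chat template. A minimal payload sketch (the `get_weather` tool is purely illustrative; actually generating a reply requires the GPU-loaded model above):

```python
# Chat messages in the role/content format apply_chat_template expects.
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

# Optional tool schema in the common JSON-schema function format
# (hypothetical tool, for illustration only).
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

# With the model loaded, passing these to ask_phi would produce a reply string.
print(messages[1]["content"])
```

Keeping all generation behind one helper like this makes later sections (RAG, tool use, fine-tuned checkpoints) trivial to wire up: each one only has to build a `messages` list.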


def banner(title):
   # Print a section divider with the given title between rules of '=' signs.
   print("\n" + "=" * 78 + f"\n  {title}\n" + "=" * 78)


