Hugging Face Transformers — Complete Guide
Introduction
Hugging Face Transformers is the de facto standard library for working with transformer models. This guide covers installation, model loading, inference, and fine-tuning with practical examples.
- Installation
- Loading Pre-trained Models
- Using Pipeline API (Simplest)
- Manual Model Loading
- Model Hub
- Inference with Different Tasks
- Fine-tuning with Trainer API
- Model Configuration
- Quantization for Faster Inference
- Batch Processing
- Custom Model Architecture
- Device Management
- Production Deployment
- Best Practices
- Conclusion
- FAQ
Installation
pip install transformers torch datasets accelerate
Loading Pre-trained Models
Using Pipeline API (Simplest)
from transformers import pipeline
# Text generation
generator = pipeline("text-generation", model="distilgpt2")
result = generator("Once upon a time", max_length=100)
print(result[0]["generated_text"])
# Classification
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
result = classifier("I love this product!")
print(result) # [{'label': 'POSITIVE', 'score': 0.9998}]
# Question answering
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
context = "Machine learning is a subset of artificial intelligence."
question = "What is machine learning?"
result = qa(question=question, context=context)
print(result)
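By default, pipelines run on CPU; pass device to target a GPU. The snippet below assumes a machine with at least one CUDA device:
# Run the generator on the first GPU; device=-1 (the default) means CPU
generator = pipeline("text-generation", model="distilgpt2", device=0)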
Manual Model Loading
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load tokenizer and model
model_name = "mistralai/Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Tokenize input
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
# Generate
outputs = model.generate(**inputs, max_new_tokens=50)  # max_new_tokens excludes the prompt length
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
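generate() also exposes decoding controls. The values below are common starting points, not recommendations:
outputs = model.generate(
    **inputs,
    max_new_tokens=50,   # cap on newly generated tokens
    do_sample=True,      # sample instead of greedy decoding
    temperature=0.7,     # soften the output distribution
    top_p=0.9,           # nucleus sampling
)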
Model Hub
The Hugging Face Model Hub hosts hundreds of thousands of pre-trained models, searchable by task, library, and language.
from huggingface_hub import list_models
# Find models by task
models = list_models(
    filter="text-generation",
    sort="downloads",
    direction=-1,
    limit=10,
)
for model in models:
    print(f"{model.id}: {model.downloads}")
Inference with Different Tasks
# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = "Machine learning is a field of artificial intelligence..."
summary = summarizer(text, max_length=50)
# Named Entity Recognition
ner = pipeline("ner", model="dslim/bert-base-NER")
result = ner("My name is John and I work at Google.")
# Sentiment Analysis
sentiment = pipeline("sentiment-analysis")
result = sentiment("I absolutely love this!")
# Fill Mask (BERT-style)
unmasker = pipeline("fill-mask", model="bert-base-uncased")
result = unmasker("The capital of France is [MASK].")
Fine-tuning with Trainer API
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
# Load dataset
dataset = load_dataset("imdb")
# Load a classification model and tokenizer (IMDB has 2 labels: positive/negative)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)
# Train
trainer.train()
# Save model and tokenizer together so the checkpoint is self-contained
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")
Model Configuration
from transformers import AutoConfig
# Load config
config = AutoConfig.from_pretrained("bert-base-uncased")
# Modify config
config.num_hidden_layers = 6     # shallower model
config.hidden_size = 256         # narrower model
config.num_attention_heads = 4   # hidden_size must divide evenly by the head count
# Create model from the modified config (weights are randomly initialized)
from transformers import AutoModel
model = AutoModel.from_config(config)
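Since from_config builds the model from scratch, a quick parameter count confirms the size reduction (the exact number depends on the config):
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")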
Quantization for Faster Inference
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
# 8-bit quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",  # gated repo: requires accepting the license on the Hub
    quantization_config=bnb_config,
    device_map="auto",
)
# 4-bit quantization (as used in QLoRA)
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    quantization_config=bnb_config,
    device_map="auto",
)
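In practice, 4-bit loading is usually paired with a compute dtype and the NF4 quantization type. A typical configuration (the values below are common defaults, not requirements):
import torch
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for matmuls
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
)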
Batch Processing
import torch
from transformers import AutoTokenizer, AutoModel
# Process multiple inputs efficiently with an encoder model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
texts = [
    "Machine learning is great",
    "I love natural language processing",
    "Transformers are powerful",
]
# Batch encoding
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
# Batch inference
with torch.no_grad():
    outputs = model(**inputs)
# Get embeddings for all inputs at once
embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
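The [CLS] vector is one pooling choice; mean pooling over non-padding tokens often works better for sentence similarity. A minimal sketch reusing the batch above:
# Mean-pool token embeddings, ignoring padding positions
mask = inputs["attention_mask"].unsqueeze(-1).float()  # (batch, seq, 1)
mean_embeddings = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1)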
Custom Model Architecture
from transformers import PreTrainedModel, PretrainedConfig
import torch.nn as nn
class CustomConfig(PretrainedConfig):
    model_type = "custom"
    def __init__(self, hidden_size=768, num_layers=12, num_heads=12, vocab_size=30522, **kwargs):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads    # hidden_size must be divisible by num_heads
        self.vocab_size = vocab_size
        super().__init__(**kwargs)
class CustomModel(PreTrainedModel):
    config_class = CustomConfig
    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=config.num_heads,
                batch_first=True,
            )
            for _ in range(config.num_layers)
        ])
    def forward(self, input_ids):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x)
        return x
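A quick smoke test of the custom model (the sizes and sequence length below are arbitrary):
import torch
config = CustomConfig(hidden_size=256, num_layers=4, num_heads=8, vocab_size=1000)
model = CustomModel(config)
dummy_ids = torch.randint(0, config.vocab_size, (2, 16))  # batch of 2, sequence length 16
out = model(dummy_ids)
print(out.shape)  # torch.Size([2, 16, 256])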
Device Management
import torch
# Automatically use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Or use device_map to distribute layers across available devices automatically
model = AutoModelForCausalLM.from_pretrained(
    "model-name",  # placeholder: substitute a real checkpoint id
    device_map="auto",
)
# Check which device the parameters landed on
print(next(model.parameters()).device)
Production Deployment
pip install fastapi uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline
app = FastAPI()
# Pin the model explicitly to avoid the default-model warning
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
class TextRequest(BaseModel):
    text: str
@app.post("/classify")
async def classify(request: TextRequest):
    result = classifier(request.text)
    return result
Run with:
uvicorn app:app --reload
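A quick smoke test against the running server (assuming uvicorn's default port 8000):
curl -X POST http://localhost:8000/classify \
  -H "Content-Type: application/json" \
  -d '{"text": "I love this product!"}'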
Best Practices
- Use the pipeline API for quick prototyping
- Use Auto classes for flexibility
- Monitor memory with large models (a minimal sketch follows the save/load example below)
- Use accelerate for multi-GPU training
- Save and version your models
# Save model and tokenizer
model.save_pretrained("./my-model")
tokenizer.save_pretrained("./my-model")
# Load them back
model = AutoModelForCausalLM.from_pretrained("./my-model")
tokenizer = AutoTokenizer.from_pretrained("./my-model")
Conclusion
Hugging Face Transformers abstracts away complexity, making state-of-the-art NLP accessible to all developers. Whether using pipeline API or advanced training, it provides tools for every use case.
FAQ
Q: Should I use pipeline API or manual model loading? A: Use pipeline for prototyping, manual loading for production and custom workflows.
Q: How do I handle models larger than VRAM? A: Use device_map="auto" for automatic memory-efficient distribution, or quantization for smaller model sizes.
Q: Can I combine multiple models? A: Yes, use ensemble techniques or create custom architectures that combine multiple pre-trained models.