Hugging Face Transformers — Complete Guide

Sanjeev Sharma
4 min read


Introduction

Hugging Face Transformers is the de facto standard library for working with transformer models. This guide covers installation, model loading, inference, and fine-tuning with practical examples.

Installation

pip install transformers torch datasets accelerate
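
To verify the install, import the library and print its version:

import transformers

# Confirm the library imports and report its version
print(transformers.__version__)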

Loading Pre-trained Models

Using Pipeline API (Simplest)

from transformers import pipeline

# Text generation
generator = pipeline("text-generation", model="distilgpt2")
result = generator("Once upon a time", max_length=100)
print(result[0]["generated_text"])

# Classification
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
result = classifier("I love this product!")
print(result)  # [{'label': 'POSITIVE', 'score': 0.9998}]

# Question answering
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
context = "Machine learning is a subset of artificial intelligence."
question = "What is machine learning?"
result = qa(question=question, context=context)
print(result)
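
Pipelines run on CPU by default. The device argument moves them onto a GPU; a minimal sketch:

import torch
from transformers import pipeline

# device=0 selects the first CUDA GPU; device=-1 (the default) is CPU
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text-generation", model="distilgpt2", device=device)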

Manual Model Loading

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model
model_name = "mistralai/Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenize input
inputs = tokenizer("Hello, how are you?", return_tensors="pt")

# Generate
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
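
generate() also accepts standard decoding controls such as do_sample, temperature, and top_p. A sketch continuing the example above:

# Sampled decoding instead of the default greedy search
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))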

Model Hub

The Hugging Face Model Hub hosts hundreds of thousands of pre-trained models, which you can search programmatically.

from huggingface_hub import list_models

# Find models by task
models = list_models(
    filter="text-generation",
    sort="downloads",
    direction=-1,
    limit=10
)

for model in models:
    print(f"{model.id}: {model.downloads}")

Inference with Different Tasks

# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = "Machine learning is a field of artificial intelligence..."
summary = summarizer(text, max_length=50)

# Named Entity Recognition
ner = pipeline("ner", model="dslim/bert-base-NER")
result = ner("My name is John and I work at Google.")

# Sentiment Analysis
sentiment = pipeline("sentiment-analysis")
result = sentiment("I absolutely love this!")

# Fill Mask (BERT-style)
unmasker = pipeline("fill-mask", model="bert-base-uncased")
result = unmasker("The capital of France is [MASK].")
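
Another task worth knowing is zero-shot classification, which scores text against arbitrary candidate labels without any fine-tuning; a sketch (the example text and labels are illustrative):

# Zero-shot classification: label text with classes the model never saw in training
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = zero_shot(
    "The new GPU doubles our training throughput",
    candidate_labels=["technology", "sports", "cooking"]
)
print(result["labels"][0])  # highest-scoring label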

Fine-tuning with Trainer API

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb")

# Load a tokenizer and a model with a classification head for IMDB (2 labels)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

# Train
trainer.train()

# Save
model.save_pretrained("./fine-tuned-model")
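
To track accuracy during evaluation, pass a compute_metrics function when constructing the Trainer. A minimal sketch (the metric computation here is illustrative):

import numpy as np

def compute_metrics(eval_pred):
    # eval_pred bundles the model's logits and the true labels
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
print(trainer.evaluate())  # reports eval_loss and eval_accuracy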

Model Configuration

from transformers import AutoConfig

# Load config
config = AutoConfig.from_pretrained("bert-base-uncased")

# Modify config (hidden_size must stay divisible by num_attention_heads)
config.num_hidden_layers = 6  # Fewer layers
config.hidden_size = 256
config.num_attention_heads = 8  # 256 / 8 = 32 dims per head

# Create model from modified config
from transformers import AutoModel
model = AutoModel.from_config(config)
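
A quick sanity check on the reduced configuration is to count parameters; a short sketch:

# Compare against ~110M parameters for the full bert-base-uncased
num_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {num_params:,}")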

Quantization for Faster Inference

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b",
    quantization_config=bnb_config,
    device_map="auto"
)

# 4-bit quantization (QLoRA)
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b",
    quantization_config=bnb_config,
    device_map="auto"
)
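
The QLoRA recipe typically combines 4-bit loading with the NF4 quantization type and a bfloat16 compute dtype; a sketch (requires the bitsandbytes package):

import torch
from transformers import BitsAndBytesConfig

# NF4 quantization with bfloat16 compute, as in the QLoRA paper
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)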

Batch Processing

import torch
from transformers import AutoTokenizer, AutoModel

# Use an encoder model that exposes hidden states for embeddings
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Process multiple inputs efficiently
texts = [
    "Machine learning is great",
    "I love natural language processing",
    "Transformers are powerful"
]

# Batch encoding
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Batch inference
with torch.no_grad():
    outputs = model(**inputs)

# Get embeddings for all inputs at once
embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
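
One common use of these embeddings is sentence similarity; a minimal sketch using cosine similarity (illustrative, not part of the original example):

import torch.nn.functional as F

# Pairwise cosine similarity between the three sentence embeddings
normalized = F.normalize(embeddings, p=2, dim=1)
similarity = normalized @ normalized.T
print(similarity)  # 3x3 matrix with 1.0 on the diagonal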

Custom Model Architecture

from transformers import PreTrainedModel, PretrainedConfig
import torch.nn as nn

class CustomConfig(PretrainedConfig):
    model_type = "custom"

    def __init__(self, hidden_size=768, num_layers=12, vocab_size=30522, **kwargs):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        super().__init__(**kwargs)

class CustomModel(PreTrainedModel):
    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=12,  # hidden_size must be divisible by nhead
                batch_first=True
            )
            for _ in range(config.num_layers)
        ])

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x)
        return x
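
The custom class then plugs into the usual save/load machinery; a sketch (hidden_size chosen to stay divisible by nhead=12):

config = CustomConfig(hidden_size=384, num_layers=4, vocab_size=30522)
model = CustomModel(config)

# Serialization works because CustomModel subclasses PreTrainedModel
model.save_pretrained("./custom-model")
model = CustomModel.from_pretrained("./custom-model")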

Device Management

import torch

# Automatically use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Or use device_map for automatic distribution
model = AutoModelForCausalLM.from_pretrained(
    "model-name",
    device_map="auto"
)

# Check device
print(next(model.parameters()).device)
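
Loading weights in half precision is another easy memory win; torch_dtype is a standard from_pretrained argument (model name is a placeholder, as above):

import torch
from transformers import AutoModelForCausalLM

# fp16 weights use ~2 bytes per parameter instead of 4
model = AutoModelForCausalLM.from_pretrained(
    "model-name",  # placeholder
    torch_dtype=torch.float16,
    device_map="auto"
)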

Production Deployment

Install the dependencies:

pip install fastapi uvicorn

Then create app.py:

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
classifier = pipeline("text-classification")

class TextRequest(BaseModel):
    text: str

@app.post("/classify")
async def classify(request: TextRequest):
    result = classifier(request.text)
    return result

Run with:

uvicorn app:app --reload
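
Then exercise the endpoint, for example with curl (illustrative request):

curl -X POST http://127.0.0.1:8000/classify \
  -H "Content-Type: application/json" \
  -d '{"text": "I love this product!"}'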

Best Practices

  1. Use pipeline API for quick prototyping
  2. Use Auto classes for flexibility
  3. Monitor memory with large models
  4. Use accelerate for multi-GPU training
  5. Save and version your models:

# Save model and tokenizer
model.save_pretrained("./my-model")
tokenizer.save_pretrained("./my-model")

# Load them back
model = AutoModelForCausalLM.from_pretrained("./my-model")
tokenizer = AutoTokenizer.from_pretrained("./my-model")

Conclusion

Hugging Face Transformers abstracts away much of the complexity of modern NLP, making state-of-the-art models accessible to every developer. Whether you reach for the pipeline API for quick inference or the Trainer API for fine-tuning, the library provides tools for the whole workflow.

FAQ

Q: Should I use the pipeline API or manual model loading?
A: Use the pipeline API for prototyping; use manual loading for production and custom workflows.

Q: How do I handle models larger than VRAM?
A: Use device_map="auto" for automatic memory-efficient distribution, or quantization to shrink the model.

Q: Can I combine multiple models?
A: Yes. Use ensemble techniques or create custom architectures that combine multiple pre-trained models.


Written by Sanjeev Sharma
Full Stack Engineer · E-mopro