Hugging Face Transformers — Complete Guide

Sanjeev Sharma
4 min read


Introduction

Hugging Face Transformers is the de facto standard library for working with transformer models. This guide covers installation, model loading, inference, and fine-tuning with practical examples.

Installation

pip install transformers torch datasets accelerate
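
To verify the install, import the library and print its version:

import transformers

# Confirm the library imports and report its version
print(transformers.__version__)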

Loading Pre-trained Models

Using Pipeline API (Simplest)

from transformers import pipeline

# Text generation
generator = pipeline("text-generation", model="distilgpt2")
result = generator("Once upon a time", max_length=100)
print(result[0]["generated_text"])

# Classification
classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
result = classifier("I love this product!")
print(result)  # [{'label': 'POSITIVE', 'score': 0.9998}]

# Question answering
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
context = "Machine learning is a subset of artificial intelligence."
question = "What is machine learning?"
result = qa(question=question, context=context)
print(result)
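
Pipelines run on CPU by default. The device argument moves them onto a GPU; a minimal sketch:

import torch
from transformers import pipeline

# device=0 selects the first CUDA GPU; device=-1 (the default) is CPU
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text-generation", model="distilgpt2", device=device)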

Manual Model Loading

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer and model
model_name = "mistralai/Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenize input
inputs = tokenizer("Hello, how are you?", return_tensors="pt")

# Generate
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
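
generate() also accepts standard decoding controls such as do_sample, temperature, and top_p. A sketch continuing the example above:

# Sampled decoding instead of the default greedy search
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))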

Model Hub

The Hugging Face Model Hub hosts hundreds of thousands of pre-trained models, which you can search programmatically.

from huggingface_hub import list_models

# Find models by task
models = list_models(
    filter="text-generation",
    sort="downloads",
    direction=-1,
    limit=10
)

for model in models:
    print(f"{model.id}: {model.downloads}")

Inference with Different Tasks

# Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
text = "Machine learning is a field of artificial intelligence..."
summary = summarizer(text, max_length=50)

# Named Entity Recognition
ner = pipeline("ner", model="dslim/bert-base-NER")
result = ner("My name is John and I work at Google.")

# Sentiment Analysis
sentiment = pipeline("sentiment-analysis")
result = sentiment("I absolutely love this!")

# Fill Mask (BERT-style)
unmasker = pipeline("fill-mask", model="bert-base-uncased")
result = unmasker("The capital of France is [MASK].")
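
Another task worth knowing is zero-shot classification, which scores text against arbitrary candidate labels without any fine-tuning; a sketch (the example text and labels are illustrative):

# Zero-shot classification: label text with classes the model never saw in training
zero_shot = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = zero_shot(
    "The new GPU doubles our training throughput",
    candidate_labels=["technology", "sports", "cooking"]
)
print(result["labels"][0])  # highest-scoring label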

Fine-tuning with Trainer API

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load dataset
dataset = load_dataset("imdb")

# Load a tokenizer and a model with a classification head for IMDB (2 labels)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

# Train
trainer.train()

# Save
model.save_pretrained("./fine-tuned-model")
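
To track accuracy during evaluation, pass a compute_metrics function when constructing the Trainer. A minimal sketch (the metric computation here is illustrative):

import numpy as np

def compute_metrics(eval_pred):
    # eval_pred bundles the model's logits and the true labels
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
print(trainer.evaluate())  # reports eval_loss and eval_accuracy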

Model Configuration

from transformers import AutoConfig

# Load config
config = AutoConfig.from_pretrained("bert-base-uncased")

# Modify config (hidden_size must stay divisible by num_attention_heads)
config.num_hidden_layers = 6  # Fewer layers
config.hidden_size = 256
config.num_attention_heads = 8  # 256 / 8 = 32 dims per head

# Create model from modified config
from transformers import AutoModel
model = AutoModel.from_config(config)
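
A quick sanity check on the reduced configuration is to count parameters; a short sketch:

# Compare against ~110M parameters for the full bert-base-uncased
num_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {num_params:,}")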

Quantization for Faster Inference

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit quantization
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b",
    quantization_config=bnb_config,
    device_map="auto"
)

# 4-bit quantization (QLoRA)
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b",
    quantization_config=bnb_config,
    device_map="auto"
)
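
The QLoRA recipe typically combines 4-bit loading with the NF4 quantization type and a bfloat16 compute dtype; a sketch (requires the bitsandbytes package):

import torch
from transformers import BitsAndBytesConfig

# NF4 quantization with bfloat16 compute, as in the QLoRA paper
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)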

Batch Processing

import torch
from transformers import AutoTokenizer, AutoModel

# Use an encoder model that exposes hidden states for embeddings
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Process multiple inputs efficiently
texts = [
    "Machine learning is great",
    "I love natural language processing",
    "Transformers are powerful"
]

# Batch encoding
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt"
)

# Batch inference
with torch.no_grad():
    outputs = model(**inputs)

# Get embeddings for all inputs at once
embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token
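
One common use of these embeddings is sentence similarity; a minimal sketch using cosine similarity (illustrative, not part of the original example):

import torch.nn.functional as F

# Pairwise cosine similarity between the three sentence embeddings
normalized = F.normalize(embeddings, p=2, dim=1)
similarity = normalized @ normalized.T
print(similarity)  # 3x3 matrix with 1.0 on the diagonal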

Custom Model Architecture

from transformers import PreTrainedModel, PretrainedConfig
import torch.nn as nn

class CustomConfig(PretrainedConfig):
    model_type = "custom"

    def __init__(self, hidden_size=768, num_layers=12, vocab_size=30522, **kwargs):
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        super().__init__(**kwargs)

class CustomModel(PreTrainedModel):
    config_class = CustomConfig

    def __init__(self, config):
        super().__init__(config)
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=12,  # hidden_size must be divisible by nhead
                batch_first=True
            )
            for _ in range(config.num_layers)
        ])

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        for layer in self.layers:
            x = layer(x)
        return x
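
The custom class then plugs into the usual save/load machinery; a sketch (hidden_size chosen to stay divisible by nhead=12):

config = CustomConfig(hidden_size=384, num_layers=4, vocab_size=30522)
model = CustomModel(config)

# Serialization works because CustomModel subclasses PreTrainedModel
model.save_pretrained("./custom-model")
model = CustomModel.from_pretrained("./custom-model")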

Device Management

import torch

# Automatically use GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Or use device_map for automatic distribution
model = AutoModelForCausalLM.from_pretrained(
    "model-name",
    device_map="auto"
)

# Check device
print(next(model.parameters()).device)
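
Loading weights in half precision is another easy memory win; torch_dtype is a standard from_pretrained argument (model name is a placeholder, as above):

import torch
from transformers import AutoModelForCausalLM

# fp16 weights use ~2 bytes per parameter instead of 4
model = AutoModelForCausalLM.from_pretrained(
    "model-name",  # placeholder
    torch_dtype=torch.float16,
    device_map="auto"
)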

Production Deployment

Install the dependencies:

pip install fastapi uvicorn

Then create app.py:

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()
classifier = pipeline("text-classification")

class TextRequest(BaseModel):
    text: str

@app.post("/classify")
async def classify(request: TextRequest):
    result = classifier(request.text)
    return result

Run with:

uvicorn app:app --reload
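
Then exercise the endpoint, for example with curl (illustrative request):

curl -X POST http://127.0.0.1:8000/classify \
  -H "Content-Type: application/json" \
  -d '{"text": "I love this product!"}'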

Best Practices

  1. Use pipeline API for quick prototyping
  2. Use Auto classes for flexibility
  3. Monitor memory with large models
  4. Use accelerate for multi-GPU training
  5. Save and version your models:

# Save model and tokenizer
model.save_pretrained("./my-model")
tokenizer.save_pretrained("./my-model")

# Load them back
model = AutoModelForCausalLM.from_pretrained("./my-model")
tokenizer = AutoTokenizer.from_pretrained("./my-model")

Conclusion

Hugging Face Transformers abstracts away much of the complexity of modern NLP, making state-of-the-art models accessible to every developer. Whether you reach for the pipeline API for quick inference or the Trainer API for fine-tuning, the library provides tools for the whole workflow.

FAQ

Q: Should I use the pipeline API or manual model loading?
A: Use the pipeline API for prototyping; use manual loading for production and custom workflows.

Q: How do I handle models larger than VRAM?
A: Use device_map="auto" for automatic memory-efficient distribution, or quantization to shrink the model.

Q: Can I combine multiple models?
A: Yes. Use ensemble techniques or create custom architectures that combine multiple pre-trained models.


Written by Sanjeev Sharma
Full Stack Engineer · E-mopro