Fine-tuning LLMs — Complete Guide with OpenAI
Introduction
Fine-tuning allows you to adapt pre-trained LLMs to specific tasks or domains. This guide covers fine-tuning with OpenAI, preparing training data, and evaluating results.
- What is Fine-tuning?
- Preparing Training Data
- Data Format for OpenAI
- Data Quality Guidelines
- Validation with OpenAI
- Fine-tuning with OpenAI
- Monitoring Training
- Using Fine-tuned Model
- Cost Estimation
- Fine-tuning Open Source Models
- Using Hugging Face Transformers
- Evaluation
- Best Practices
- LoRA: Efficient Fine-tuning
- Conclusion
- FAQ
What is Fine-tuning?
Fine-tuning adapts a pre-trained model to your specific use case by continuing training on labeled examples. It is far cheaper than training a model from scratch and works with much smaller datasets.
Use cases:
- Domain-specific language (legal, medical)
- Specific output formats such as JSON or CSV (see the example after this list)
- Brand voice or style
- Specialized tasks
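For the output-format use case, each training record pairs a natural-language request with an assistant reply that is itself valid JSON, so the model learns to emit that format consistently. A minimal hypothetical record (the field names and values are illustrative):

import json

# Hypothetical training record that teaches JSON-formatted output
format_example = {
    "messages": [
        {"role": "system", "content": "Extract order details as JSON."},
        {"role": "user", "content": "Two large pizzas delivered to 5 Main St."},
        {"role": "assistant", "content": json.dumps({
            "item": "pizza",
            "size": "large",
            "quantity": 2,
            "address": "5 Main St"
        })}
    ]
}
print(format_example["messages"][-1]["content"])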
Preparing Training Data
Data Format for OpenAI
# JSONL format (one JSON object per line)
# Each line: {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}
import json

training_data = [
    {
        "messages": [
            {"role": "system", "content": "You are a helpful customer service AI."},
            {"role": "user", "content": "How do I reset my password?"},
            {"role": "assistant", "content": "To reset your password, click 'Forgot Password' on the login page..."}
        ]
    },
    {
        "messages": [
            {"role": "system", "content": "You are a helpful customer service AI."},
            {"role": "user", "content": "What are your business hours?"},
            {"role": "assistant", "content": "We're open Monday-Friday 9AM-5PM EST..."}
        ]
    }
]

# Save to JSONL file
with open("training_data.jsonl", "w") as f:
    for item in training_data:
        f.write(json.dumps(item) + "\n")
Data Quality Guidelines
# Good example: a specific question with a complete, relevant answer
good_examples = [
    {
        "messages": [
            {"role": "system", "content": "You are a Python expert."},
            {"role": "user", "content": "How do I sort a list?"},
            {"role": "assistant", "content": "Use the sorted() function: sorted([3, 1, 2]) returns [1, 2, 3]"}
        ]
    }
]

# Poor example: a trivial exchange with no context for the model to learn from
poor_examples = [
    {
        "messages": [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hi"}
        ]
    }
]
Validation with OpenAI
from openai import OpenAI

client = OpenAI()

# Upload the training file; OpenAI validates the format server-side
with open("training_data.jsonl", "rb") as f:
    result = client.files.create(
        file=f,
        purpose="fine-tune"
    )

file_id = result.id
print(f"File uploaded: {file_id}")
Fine-tuning with OpenAI
# Create a fine-tuning job (hyperparameters are optional; omit them to let OpenAI choose)
job = client.fine_tuning.jobs.create(
    training_file=file_id,
    model="gpt-3.5-turbo",
    hyperparameters={
        "n_epochs": 3,
        "batch_size": 16,
        "learning_rate_multiplier": 1.0
    }
)
print(f"Fine-tuning job created: {job.id}")
Monitoring Training
# Check job status
job = client.fine_tuning.jobs.retrieve(job.id)
print(f"Status: {job.status}")
print(f"Training file: {job.training_file}")

# List all fine-tuning jobs
jobs = client.fine_tuning.jobs.list(limit=10)
for j in jobs.data:
    print(f"{j.id}: {j.status}")
Using Fine-tuned Model
# Once training succeeds, job.fine_tuned_model holds the new model name,
# e.g. "ft:gpt-3.5-turbo-0125:my-org::abc123"
response = client.chat.completions.create(
    model=job.fine_tuned_model,
    messages=[
        {"role": "system", "content": "You are a customer service AI."},
        {"role": "user", "content": "How do I track my order?"}
    ],
    temperature=0.7
)
print(response.choices[0].message.content)
Cost Estimation
# OpenAI fine-tuning costs (rates change; check openai.com/pricing for current prices)
def estimate_finetuning_cost(
    training_tokens: int, epochs: int, price_per_1k: float = 0.008
) -> float:
    """
    Estimate fine-tuning cost.
    The default rate is gpt-3.5-turbo's launch training price of
    $0.008 per 1K tokens; substitute the current rate from the pricing page.
    """
    total_tokens = training_tokens * epochs
    return (total_tokens / 1000) * price_per_1k

# Example: 100k tokens * 3 epochs at $0.008/1K
cost = estimate_finetuning_cost(100_000, 3)
print(f"Estimated cost: ${cost:.2f}")  # $2.40
Fine-tuning Open Source Models
Using Hugging Face Transformers
pip install transformers datasets torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load model and tokenizer (the Hugging Face model ID is "mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token  # Mistral's tokenizer has no pad token

# Load dataset (here: a local JSONL file with a "text" field per record)
dataset = load_dataset("json", data_files={"train": "your_dataset.jsonl"})

# Tokenize
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=100,
    learning_rate=2e-5
)

# Trainer; the collator copies input_ids into labels so the causal-LM loss can be computed
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Fine-tune
trainer.train()

# Save model and tokenizer together
model.save_pretrained("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")
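To sanity-check the result, reload the saved model and generate a completion. A minimal sketch; the prompt and generation settings are illustrative:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the fine-tuned weights and tokenizer from disk
model = AutoModelForCausalLM.from_pretrained("./fine-tuned-model")
tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-model")

inputs = tokenizer("How do I sort a list in Python?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))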
Evaluation
from openai import OpenAI
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

client = OpenAI()

def evaluate_finetuned_model(model: str, test_data: list) -> dict:
    """Evaluate a fine-tuned model by exact-match against reference outputs.

    Exact-match metrics only make sense for classification-style tasks
    with a small, fixed set of expected outputs.
    """
    predictions = []
    labels = []
    for example in test_data:
        response = client.chat.completions.create(
            model=model,
            messages=example["messages"],
            temperature=0
        )
        predictions.append(response.choices[0].message.content)
        labels.append(example["expected_output"])

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted"
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }
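As a usage sketch, the test set mirrors the training records plus an expected_output field holding the reference answer (the values below are hypothetical, and exact-match scoring assumes a classification-style task):

# Hypothetical held-out test set
test_data = [
    {
        "messages": [
            {"role": "system", "content": "Classify the sentiment as positive or negative."},
            {"role": "user", "content": "I love this product!"}
        ],
        "expected_output": "positive"
    }
]

metrics = evaluate_finetuned_model(job.fine_tuned_model, test_data)
print(metrics)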
Best Practices
# 1. Use diverse, high-quality examples
# 2. Include edge cases and negative examples
# 3. Validate on a held-out test set
# 4. Start with a small number of epochs
# 5. Monitor for overfitting (see the validation-file sketch after this block)

# Example: proper data split
import random

all_data = [...]  # Your training data
random.shuffle(all_data)

train_size = int(0.8 * len(all_data))
train_data = all_data[:train_size]
val_data = all_data[train_size:]
print(f"Training: {len(train_data)}, Validation: {len(val_data)}")
LoRA: Efficient Fine-tuning
For large models, LoRA (Low-Rank Adaptation) is far more efficient: it freezes the base weights and trains only small adapter matrices, cutting memory and compute substantially:
pip install peft bitsandbytes
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM

# Load the model in 8-bit to reduce memory (requires bitsandbytes)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", load_in_8bit=True
)

# Configure LoRA
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA; only the adapter weights (a small fraction) are trainable
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # prints the trainable/total parameter counts

# Train with the PEFT model (same Trainer setup as before)
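After training, the adapter weights (a few megabytes) can be saved on their own and later merged back into the base model for standalone deployment. A sketch using peft's standard calls, continuing from the block above:

from peft import PeftModel
from transformers import AutoModelForCausalLM

# Save only the small adapter weights
model.save_pretrained("./lora-adapter")

# Later: reload the base model in full precision, attach the adapter,
# and merge the weights into a single standalone model
base = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
tuned = PeftModel.from_pretrained(base, "./lora-adapter")
merged = tuned.merge_and_unload()
merged.save_pretrained("./merged-model")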
Conclusion
Fine-tuning adapts LLMs to your specific needs. Whether using OpenAI's managed service or fine-tuning open-source models, the key is quality training data and proper evaluation.
FAQ
Q: How much training data do I need? A: OpenAI accepts as few as 10 examples, but 50-100 well-crafted examples per task typically produce clear improvements. Quality matters more than quantity.
Q: Can I fine-tune GPT-4? A: Fine-tuning launched with gpt-3.5-turbo only, but OpenAI has since opened fine-tuning for newer models such as gpt-4o; check the fine-tuning guide for the current model list.
Q: Should I fine-tune or use prompt engineering? A: Start with prompt engineering. Fine-tune if you need consistent, domain-specific behavior or custom output formats.