Mistral AI — Open Source LLM Guide
Introduction
Mistral's open-source models are among the most efficient LLMs available, delivering strong quality with modest hardware requirements. This guide covers everything from setup to production deployment.
- Mistral Model Variants
- Quick Start with Ollama
- Using Hugging Face
- Structured Output
- Chat Template
- Quantization for Efficiency
- Using Mistral API
- Integration with LangChain
- Fine-tuning Mistral
- Benchmarks
- Deployment
- Conclusion
- FAQ
Mistral Model Variants
Mistral-7B: 7B parameters, dense, Apache 2.0
- Best for: Speed- and cost-sensitive applications
- Inference speed: fastest of the family; hundreds of tokens/sec in aggregate with batched GPU serving
Mixtral-8x7B: sparse Mixture of Experts (~47B total parameters, ~13B active per token), Apache 2.0
- Best for: Higher quality while staying efficient
- Inference speed: per-token cost close to a ~13B dense model, since only 2 of 8 experts run per token
Mistral Large: Highest quality, proprietary
- Best for: Complex reasoning tasks
- Via: API only
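For the local examples below, the open-weights checkpoints are pulled from the Hugging Face Hub. A small lookup of the repository IDs used in this guide (these are the instruct revisions referenced later; check the Hub for newer releases):
# Hugging Face Hub repository IDs for the open-weights instruct checkpoints
MISTRAL_MODELS = {
    "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral-8x7b-instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
}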
Quick Start with Ollama
# Simplest method
ollama pull mistral
ollama run mistral
# Interactive chat
>>> Tell me about machine learning
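Ollama also exposes a local HTTP API (on port 11434 by default), which is convenient for scripting. A minimal sketch using the requests library against the default server:
import requests

# Call the local Ollama server's generate endpoint (non-streaming)
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "mistral", "prompt": "Tell me about machine learning", "stream": False},
)
print(resp.json()["response"])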
Using Hugging Face
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Generate
prompt = "What is machine learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,      # sampling must be enabled for temperature to take effect
    temperature=0.7
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
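To see tokens as they are produced instead of waiting for the full completion, transformers provides a TextStreamer that prints output incrementally. A minimal sketch reusing the model and tokenizer loaded above:
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated
streamer = TextStreamer(tokenizer, skip_prompt=True)
inputs = tokenizer("Explain gradient descent briefly.", return_tensors="pt").to(model.device)
_ = model.generate(**inputs, max_new_tokens=128, streamer=streamer)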
Structured Output
import json

def generate_structured_json(topic: str):
    """Generate JSON output using Mistral."""
    prompt = f"""Generate a JSON object with information about {topic}.
Include fields: name, description, use_cases (list), pros (list), cons (list).
Return only valid JSON:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3  # lower temperature for more consistent formatting
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the first {...} span (the decoded output echoes the prompt)
    try:
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        json_str = response[json_start:json_end]
        return json.loads(json_str)
    except json.JSONDecodeError:
        return None

# Usage
result = generate_structured_json("machine learning")
print(json.dumps(result, indent=2))
Chat Template
def chat_with_mistral(messages: list) -> str:
    """Chat using Mistral's instruction format."""
    # Format: [INST] user message [/INST] assistant response </s>
    chat_text = ""
    for msg in messages:
        if msg["role"] == "user":
            chat_text += f"[INST] {msg['content']} [/INST] "
        else:
            chat_text += f"{msg['content']} </s>"
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Usage
messages = [
    {"role": "user", "content": "What is quantum computing?"}
]
response = chat_with_mistral(messages)
print(response)
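Rather than building the prompt string by hand, the tokenizer shipped with the instruct checkpoints carries the official chat template, so the same conversation can be formatted with apply_chat_template. A minimal sketch using the messages list above:
# Let the tokenizer apply Mistral's chat template and return input IDs
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))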
Quantization for Efficiency
# 8-bit quantization
pip install bitsandbytes

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto"
)

# 4-bit quantization (even smaller)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto"
)
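A quick way to check how much memory a quantized checkpoint actually occupies is the model's get_memory_footprint() helper:
# Report the loaded model's approximate memory footprint in GB
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model memory footprint: {footprint_gb:.1f} GB")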
Using Mistral API
pip install mistralai

# Note: these imports correspond to the 0.x version of the mistralai Python client
from mistralai.client import MistralClient
from mistralai.models.chat_message import ChatMessage

client = MistralClient(api_key="your-api-key")

messages = [
    ChatMessage(role="user", content="What is AI?")
]
response = client.chat(
    model="mistral-medium",
    messages=messages,
    max_tokens=256
)
print(response.choices[0].message.content)
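Responses can also be streamed token by token. A hedged sketch against the 0.x client's chat_stream method (attribute names may differ in other client versions):
# Stream the completion chunk by chunk (0.x client API)
for chunk in client.chat_stream(model="mistral-medium", messages=messages):
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)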
Integration with LangChain
# Via the hosted API (requires the langchain-mistralai package)
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate

llm = ChatMistralAI(api_key="your-key")

# Or via a local Ollama server (requires the langchain-community package)
from langchain_community.llms import Ollama
llm = Ollama(model="mistral")

# Use in a chain
prompt = ChatPromptTemplate.from_template(
    "Explain {topic} in simple terms"
)
chain = prompt | llm
result = chain.invoke({"topic": "neural networks"})
print(result)
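When the chat model path is used, the chain returns a message object rather than a plain string; appending a StrOutputParser normalizes the output either way. A minimal sketch:
from langchain_core.output_parsers import StrOutputParser

# Normalize the chain output to a plain string
chain = prompt | llm | StrOutputParser()
print(chain.invoke({"topic": "gradient boosting"}))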
Fine-tuning Mistral
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# Load Mistral in 8-bit so fine-tuning fits on a single GPU
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

# Apply LoRA adapters to the attention projections
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Fine-tune with Trainer
# (see previous fine-tuning guide)
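To make that last step concrete, here is a minimal, hedged sketch of the training loop using the Hugging Face Trainer on a tiny toy dataset; the example texts, hyperparameters, and output directory are placeholders, not recommendations:
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Mistral's tokenizer has no pad token by default; reuse EOS for padding
tokenizer.pad_token = tokenizer.eos_token

# Toy dataset: a few instruction-style strings (placeholder data)
texts = ["[INST] What is overfitting? [/INST] Overfitting is when a model memorizes training data.</s>"]
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=512)
dataset = Dataset.from_dict({"text": texts}).map(tokenize, remove_columns=["text"])

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="mistral-lora-out",   # placeholder path
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
    ),
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
model.save_pretrained("mistral-lora-out")  # saves only the LoRA adapter weights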
Benchmarks
import time

def benchmark_mistral(prompt: str):
    """Benchmark Mistral generation speed."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=100)
    elapsed = time.time() - start
    # Count only newly generated tokens, not the prompt
    new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
    speed = new_tokens / elapsed
    print(f"Speed: {speed:.1f} tokens/sec")
    print(f"Time: {elapsed:.2f}s")

benchmark_mistral("Machine learning is")
# Actual throughput varies widely with hardware, quantization, and batch size;
# measure on your own setup rather than relying on published figures.
Deployment
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 256

@app.post("/generate")
async def generate(request: GenerateRequest):
    try:
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt"
        ).to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run: uvicorn app:app --reload
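Once the server is running, the endpoint can be exercised with a small client script; the URL and port below assume the default uvicorn settings:
import requests

# Send a test request to the local generation endpoint
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Explain transfer learning in one sentence.", "max_tokens": 128},
)
resp.raise_for_status()
print(resp.json()["response"])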
Conclusion
Mistral is an excellent open-source choice for efficient, capable LLM deployments. With quantization, it runs on consumer hardware while delivering high-quality results.
FAQ
Q: Should I use Mistral or Llama? A: Mistral-7B is fast and memory-efficient for its size; whether a comparably sized Llama model offers better quality depends on the specific versions and your task, so benchmark both on your own workload before committing.
Q: Is Mistral suitable for production? A: Yes, use Ollama for simple deployments or containerize for scale.
Q: Can I fine-tune Mistral? A: Yes, with LoRA for efficiency or full fine-tuning with sufficient VRAM.