Mistral AI — Open Source LLM Guide
Introduction
Mistral's open-source models are among the most efficient LLMs available, delivering strong quality with modest hardware requirements. This guide covers everything from setup to production deployment.
- Mistral Model Variants
- Quick Start with Ollama
- Using Hugging Face
- Structured Output
- Chat Template
- Quantization for Efficiency
- Using Mistral API
- Integration with LangChain
- Fine-tuning Mistral
- Benchmarks
- Deployment
- Conclusion
- FAQ
Mistral Model Variants
Mistral-7B: 7B parameters, dense, Apache 2.0
- Best for: Speed- and cost-sensitive applications
- Inference speed: fastest of the family; hundreds of tokens/sec in aggregate with batched GPU serving
Mixtral-8x7B: sparse Mixture of Experts (~47B total parameters, ~13B active per token), Apache 2.0
- Best for: Higher quality while staying efficient
- Inference speed: per-token cost close to a ~13B dense model, since only 2 of 8 experts run per token
Mistral Large: Highest quality, proprietary
- Best for: Complex reasoning tasks
- Via: API only
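For the local examples below, the open-weights checkpoints are pulled from the Hugging Face Hub. A small lookup of the repository IDs used in this guide (these are the instruct revisions referenced later; check the Hub for newer releases):
# Hugging Face Hub repository IDs for the open-weights instruct checkpoints
MISTRAL_MODELS = {
    "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral-8x7b-instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",
}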
Quick Start with Ollama
# Simplest method
ollama pull mistral
ollama run mistral
# Interactive chat
>>> Tell me about machine learning
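Ollama also exposes a local HTTP API (on port 11434 by default), which is convenient for scripting. A minimal sketch using the requests library against the default server:
import requests

# Call the local Ollama server's generate endpoint (non-streaming)
resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "mistral", "prompt": "Tell me about machine learning", "stream": False},
)
print(resp.json()["response"])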
Using Hugging Face
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Generate
prompt = "What is machine learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,      # sampling must be enabled for temperature to take effect
    temperature=0.7
)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
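To see tokens as they are produced instead of waiting for the full completion, transformers provides a TextStreamer that prints output incrementally. A minimal sketch reusing the model and tokenizer loaded above:
from transformers import TextStreamer

# Stream decoded tokens to stdout as they are generated
streamer = TextStreamer(tokenizer, skip_prompt=True)
inputs = tokenizer("Explain gradient descent briefly.", return_tensors="pt").to(model.device)
_ = model.generate(**inputs, max_new_tokens=128, streamer=streamer)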
Structured Output
import json

def generate_structured_json(topic: str):
    """Generate JSON output using Mistral."""
    prompt = f"""Generate a JSON object with information about {topic}.
Include fields: name, description, use_cases (list), pros (list), cons (list).
Return only valid JSON:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3  # lower temperature for more consistent formatting
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the first {...} span (the decoded output echoes the prompt)
    try:
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        json_str = response[json_start:json_end]
        return json.loads(json_str)
    except json.JSONDecodeError:
        return None

# Usage
result = generate_structured_json("machine learning")
print(json.dumps(result, indent=2))
Chat Template
def chat_with_mistral(messages: list) -> str:
    """Chat using Mistral's instruction format."""
    # Format: [INST] user message [/INST] assistant response </s>
    chat_text = ""
    for msg in messages:
        if msg["role"] == "user":
            chat_text += f"[INST] {msg['content']} [/INST] "
        else:
            chat_text += f"{msg['content']} </s>"
    inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Usage
messages = [
    {"role": "user", "content": "What is quantum computing?"}
]
response = chat_with_mistral(messages)
print(response)
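Rather than building the prompt string by hand, the tokenizer shipped with the instruct checkpoints carries the official chat template, so the same conversation can be formatted with apply_chat_template. A minimal sketch using the messages list above:
# Let the tokenizer apply Mistral's chat template and return input IDs
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))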
Quantization for Efficiency
# 8-bit quantization
pip install bitsandbytes

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto"
)

# 4-bit quantization (even smaller)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto"
)
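A quick way to check how much memory a quantized checkpoint actually occupies is the model's get_memory_footprint() helper:
# Report the loaded model's approximate memory footprint in GB
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model memory footprint: {footprint_gb:.1f} GB")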
Using Mistral API
pip install mistralai

# Note: these imports correspond to the 0.x version of the mistralai Python client
from mistralai.client import MistralClient
from mistralai.models.chat_message import ChatMessage

client = MistralClient(api_key="your-api-key")

messages = [
    ChatMessage(role="user", content="What is AI?")
]
response = client.chat(
    model="mistral-medium",
    messages=messages,
    max_tokens=256
)
print(response.choices[0].message.content)
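Responses can also be streamed token by token. A hedged sketch against the 0.x client's chat_stream method (attribute names may differ in other client versions):
# Stream the completion chunk by chunk (0.x client API)
for chunk in client.chat_stream(model="mistral-medium", messages=messages):
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)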
Integration with LangChain
# Via the hosted API (requires the langchain-mistralai package)
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate

llm = ChatMistralAI(api_key="your-key")

# Or via a local Ollama server (requires the langchain-community package)
from langchain_community.llms import Ollama
llm = Ollama(model="mistral")

# Use in a chain
prompt = ChatPromptTemplate.from_template(
    "Explain {topic} in simple terms"
)
chain = prompt | llm
result = chain.invoke({"topic": "neural networks"})
print(result)
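When the chat model path is used, the chain returns a message object rather than a plain string; appending a StrOutputParser normalizes the output either way. A minimal sketch:
from langchain_core.output_parsers import StrOutputParser

# Normalize the chain output to a plain string
chain = prompt | llm | StrOutputParser()
print(chain.invoke({"topic": "gradient boosting"}))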
Fine-tuning Mistral
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# Load Mistral in 8-bit so fine-tuning fits on a single GPU
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

# Apply LoRA adapters to the attention projections
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Fine-tune with Trainer
# (see previous fine-tuning guide)
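To make that last step concrete, here is a minimal, hedged sketch of the training loop using the Hugging Face Trainer on a tiny toy dataset; the example texts, hyperparameters, and output directory are placeholders, not recommendations:
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Mistral's tokenizer has no pad token by default; reuse EOS for padding
tokenizer.pad_token = tokenizer.eos_token

# Toy dataset: a few instruction-style strings (placeholder data)
texts = ["[INST] What is overfitting? [/INST] Overfitting is when a model memorizes training data.</s>"]
def tokenize(example):
    return tokenizer(example["text"], truncation=True, max_length=512)
dataset = Dataset.from_dict({"text": texts}).map(tokenize, remove_columns=["text"])

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="mistral-lora-out",   # placeholder path
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=10,
    ),
    train_dataset=dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
model.save_pretrained("mistral-lora-out")  # saves only the LoRA adapter weights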
Benchmarks
import time

def benchmark_mistral(prompt: str):
    """Benchmark Mistral generation speed."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=100)
    elapsed = time.time() - start
    # Count only newly generated tokens, not the prompt
    new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
    speed = new_tokens / elapsed
    print(f"Speed: {speed:.1f} tokens/sec")
    print(f"Time: {elapsed:.2f}s")

benchmark_mistral("Machine learning is")
# Actual throughput varies widely with hardware, quantization, and batch size;
# measure on your own setup rather than relying on published figures.
Deployment
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 256

@app.post("/generate")
async def generate(request: GenerateRequest):
    try:
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt"
        ).to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run: uvicorn app:app --reload
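Once the server is running, the endpoint can be exercised with a small client script; the URL and port below assume the default uvicorn settings:
import requests

# Send a test request to the local generation endpoint
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Explain transfer learning in one sentence.", "max_tokens": 128},
)
resp.raise_for_status()
print(resp.json()["response"])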
Conclusion
Mistral is an excellent open-source choice for efficient, capable LLM deployments. With quantization, it runs on consumer hardware while delivering high-quality results.
FAQ
Q: Should I use Mistral or Llama? A: Mistral-7B is fast and memory-efficient for its size; whether a comparably sized Llama model offers better quality depends on the specific versions and your task, so benchmark both on your own workload before committing.
Q: Is Mistral suitable for production? A: Yes, use Ollama for simple deployments or containerize for scale.
Q: Can I fine-tune Mistral? A: Yes, with LoRA for efficiency or full fine-tuning with sufficient VRAM.