Mistral AI — Open Source LLM Guide

Sanjeev Sharma
4 min read

Introduction

Mistral's open-weight models are among the most efficient LLMs available, delivering strong quality on relatively modest hardware. This guide covers everything from local setup to production deployment.

Mistral Model Variants

Mistral 7B: 7B parameters, very efficient
- Best for: speed-sensitive applications
- Inference speed: tens of tokens/sec for a single request on a consumer GPU; hundreds of tokens/sec with batched serving

Mixtral 8x7B: sparse Mixture-of-Experts model (~47B parameters total, ~13B active per token)
- Best for: higher quality while staying efficient
- Inference speed: lower per-request throughput than Mistral 7B, but strong quality per unit of compute

Mistral Large: highest quality (commercial, not open-weight)
- Best for: complex reasoning tasks
- Available via: API only
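
The open-weight variants are published on Hugging Face under the mistralai organization, while Mistral Large is only reachable through the API. A small lookup like the sketch below keeps that choice in one place; the identifiers shown are the commonly published ones, so verify them against the Hugging Face Hub and the Mistral API docs before relying on them.

# Illustrative helper: map the variants above to their published identifiers.
MODEL_IDS = {
    "mistral-7b": "mistralai/Mistral-7B-Instruct-v0.2",      # open weights, Hugging Face
    "mixtral-8x7b": "mistralai/Mixtral-8x7B-Instruct-v0.1",  # open weights, Hugging Face
    "mistral-large": "mistral-large-latest",                  # hosted, API only
}

def pick_model(name: str) -> str:
    """Return the identifier to load (Hugging Face) or request (API)."""
    return MODEL_IDS[name]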

Quick Start with Ollama

# Simplest method
ollama pull mistral
ollama run mistral

# Interactive chat
>>> Tell me about machine learning
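
Besides the interactive CLI, the Ollama server exposes a local HTTP API (on port 11434 by default), so the same model can be called from code. A minimal sketch using the requests library, assuming the Ollama server is running locally:

# Call the local Ollama server's /api/generate endpoint
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "mistral",
        "prompt": "Tell me about machine learning",
        "stream": False,  # return one JSON object instead of a token stream
    },
    timeout=120,
)
print(resp.json()["response"])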

Using Hugging Face

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Generate
prompt = "What is machine learning?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,   # sampling must be enabled for temperature to take effect
    temperature=0.7
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
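
Note that decoding outputs[0] returns the prompt plus the completion, because generate() keeps the input tokens at the start of the output sequence. If you only want the newly generated text, slice off the prompt tokens first:

# Decode only the tokens generated after the prompt
prompt_len = inputs["input_ids"].shape[-1]
completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
print(completion)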

Structured Output

import json

def generate_structured_json(topic: str):
    """Generate JSON output using Mistral."""
    prompt = f"""Generate a JSON object with information about {topic}.
Include fields: name, description, use_cases (list), pros (list), cons (list).

Return only valid JSON:"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=True,   # sampling must be enabled for temperature to apply
        temperature=0.3   # lower temperature for more consistent JSON
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract JSON
    try:
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        json_str = response[json_start:json_end]
        return json.loads(json_str)
    except json.JSONDecodeError:
        return None

# Usage
result = generate_structured_json("machine learning")
print(json.dumps(result, indent=2))
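
Because the model can still return malformed or incomplete JSON, it helps to validate the parsed result against an explicit schema before using it downstream. A sketch using pydantic; the TopicInfo model and its fields simply mirror the prompt above and are illustrative:

from typing import List, Optional
from pydantic import BaseModel, ValidationError

class TopicInfo(BaseModel):
    name: str
    description: str
    use_cases: List[str]
    pros: List[str]
    cons: List[str]

def generate_validated(topic: str) -> Optional[TopicInfo]:
    """Parse and validate the model's JSON against an explicit schema."""
    data = generate_structured_json(topic)
    if data is None:
        return None
    try:
        return TopicInfo(**data)
    except ValidationError:
        return None  # schema mismatch: retry with a stricter prompt or fall back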

Chat Template

def chat_with_mistral(messages: list) -> str:
    """Chat using Mistral's chat template."""
    # Mistral's raw format is: <s>[INST] user message [/INST] assistant reply</s>
    # The tokenizer's built-in chat template applies it (including special tokens) reliably.
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

    outputs = model.generate(input_ids, max_new_tokens=256)

    # Return only the newly generated reply, not the formatted prompt
    reply_ids = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)

# Usage
messages = [
    {"role": "user", "content": "What is quantum computing?"}
]
response = chat_with_mistral(messages)
print(response)
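
The same helper handles multi-turn conversations: append the assistant's reply to the history and ask a follow-up, for example:

# Multi-turn follow-up: reuse the previous reply as the assistant turn
messages = [
    {"role": "user", "content": "What is quantum computing?"},
    {"role": "assistant", "content": response},
    {"role": "user", "content": "Now explain it to a 10-year-old."},
]
print(chat_with_mistral(messages))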

Quantization for Efficiency

# 8-bit quantization (requires: pip install bitsandbytes)
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto"
)

# 4-bit quantization (even smaller)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=bnb_config,
    device_map="auto"
)
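
To see what quantization actually saves, transformers can report the loaded model's memory footprint. A quick check; the GB figures in the comment are rough guides and vary by revision and dtype:

# Rough memory check after loading a quantized model
footprint_gb = model.get_memory_footprint() / 1024**3
print(f"Model memory footprint: {footprint_gb:.1f} GB")
# As a rough guide, Mistral-7B weights are ~14 GB in fp16, ~7-8 GB in 8-bit, ~4-5 GB in 4-bit.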

Using Mistral API

# Requires: pip install mistralai
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

client = MistralClient(api_key="your-api-key")

messages = [
    ChatMessage(role="user", content="What is AI?")
]

response = client.chat(
    model="mistral-medium",
    messages=messages,
    max_tokens=256
)

print(response.choices[0].message.content)
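
The client also supports streaming, which is useful for chat UIs. A sketch with the same client; the exact chunk shape depends on your installed mistralai version, so check its docs before copying this verbatim:

# Streaming sketch: print tokens as they arrive
for chunk in client.chat_stream(
    model="mistral-medium",
    messages=messages,
):
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)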

Integration with LangChain

from langchain_mistralai.chat_models import ChatMistralAI
from langchain_core.prompts import ChatPromptTemplate

# Via the Mistral API (requires: pip install langchain-mistralai)
llm = ChatMistralAI(mistral_api_key="your-key")

# Or via a local Ollama model
from langchain_community.llms import Ollama
llm = Ollama(model="mistral")

# Use in chain
prompt = ChatPromptTemplate.from_template(
    "Explain {topic} in simple terms"
)

chain = prompt | llm

result = chain.invoke({"topic": "neural networks"})
print(result)
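
Because the chain is just a pipeline of runnables, you can append an output parser so the result is always a plain string, whether the underlying LLM returns raw text or a message object:

from langchain_core.output_parsers import StrOutputParser

# prompt | llm | parser: the parser normalizes the model output to a plain string
chain = prompt | llm | StrOutputParser()
result = chain.invoke({"topic": "gradient descent"})
print(result)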

Fine-tuning Mistral

from peft import get_peft_model, prepare_model_for_kbit_training, LoraConfig

# Load Mistral in 8-bit
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)
model = prepare_model_for_kbit_training(model)

# Apply LoRA adapters to the attention projections
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Fine-tune with Trainer
# (see previous fine-tuning guide)
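
As a rough sketch of that training loop, a standard Trainer run over an already-tokenized instruction dataset looks like the following. The dataset variable, output directory, and hyperparameters are placeholders, not recommendations:

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# `train_dataset` is assumed to be a tokenized dataset with an "input_ids" column.
training_args = TrainingArguments(
    output_dir="mistral-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
model.save_pretrained("mistral-lora")  # saves only the LoRA adapter weights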

Benchmarks

import time

def benchmark_mistral(prompt: str):
    """Benchmark Mistral generation speed."""
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=100)
    elapsed = time.time() - start

    # Count only newly generated tokens, not the prompt
    new_tokens = outputs[0].shape[-1] - inputs["input_ids"].shape[-1]
    speed = new_tokens / elapsed

    print(f"Speed: {speed:.1f} tokens/sec")
    print(f"Time: {elapsed:.2f}s")

benchmark_mistral("Machine learning is")

# Typical results vary widely with hardware, quantization, and batch size.
# A single request on a consumer GPU usually yields tens of tokens/sec;
# CPU-only inference is far slower, often below 10 tokens/sec for a 7B model.

Deployment

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 256

# Sync handler: FastAPI runs it in a worker thread, so the blocking
# model.generate() call does not stall the event loop.
@app.post("/generate")
def generate(request: GenerateRequest):
    try:
        inputs = tokenizer(
            request.prompt,
            return_tensors="pt"
        ).to("cuda")

        outputs = model.generate(
            **inputs,
            max_new_tokens=request.max_tokens
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return {"response": response}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Run: uvicorn app:app --reload
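
Once the server is running, any HTTP client can call the endpoint. For example, with requests, assuming the default localhost:8000 address:

# Example client call against the FastAPI endpoint above
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Explain transformers briefly", "max_tokens": 128},
    timeout=120,
)
print(resp.json()["response"])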

Conclusion

Mistral is an excellent open-source choice for efficient, capable LLM deployments. With quantization, it runs on consumer hardware while delivering high-quality results.

FAQ

Q: Should I use Mistral or Llama? A: Mistral 7B is generally faster and lighter at a comparable size, while larger Llama models can edge it out on quality. For latency- or cost-sensitive applications, Mistral is usually the better choice.

Q: Is Mistral suitable for production? A: Yes, use Ollama for simple deployments or containerize for scale.

Q: Can I fine-tune Mistral? A: Yes, with LoRA for efficiency or full fine-tuning with sufficient VRAM.

Written by
Sanjeev Sharma
Full Stack Engineer · E-mopro