# MLX
Running and fine-tuning LLMs on Apple Silicon with MLX. Use when working with models locally on Mac, converting Hugging Face models to MLX format, fine-tuning with LoRA/QLoRA on Apple Silicon, or serving models via HTTP API.
npx skill4agent add itsmostafa/llm-engineering-skills mlx

| Aspect | PyTorch on Mac | MLX |
|---|---|---|
| Memory | Separate CPU/GPU copies | Unified memory, no copies |
| Optimization | Generic Metal backend | Apple Silicon native |
| Model loading | Slower, more memory | Lazy loading, efficient |
| Quantization | Limited support | Built-in 4/8-bit |
pip install mlx-lm

from mlx_lm import load, generate
# Load the model and tokenizer (from the HF hub or a local path).
model, tokenizer = load("mlx-community/Llama-3.2-3B-Instruct-4bit")

# Generate a completion for a single prompt.
response = generate(
    model,
    tokenizer,
    prompt="Explain quantum computing in simple terms:",
    max_tokens=256,
    # NOTE(review): newer mlx-lm releases moved sampling options (temp, top_p)
    # onto a sampler object (mlx_lm.sample_utils.make_sampler); confirm the
    # installed version still accepts `temp` directly.
    temp=0.7,
)
print(response)

from mlx_lm import load, stream_generate
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")

prompt = "Write a haiku about programming:"

# stream_generate yields response chunks as tokens are produced;
# print each chunk immediately, with no trailing newline, so the
# output appears token-by-token.
for response in stream_generate(model, tokenizer, prompt, max_tokens=100):
    print(response.text, end="", flush=True)
print()

from mlx_lm import load, batch_generate
model, tokenizer = load("mlx-community/Qwen2.5-7B-Instruct-4bit")

# All prompts are processed together in one batched call.
prompts = [
    "What is machine learning?",
    "Explain neural networks:",
    "Define deep learning:",
]

# NOTE(review): batch_generate's availability and signature vary across
# mlx-lm versions; confirm it exists in the installed release.
responses = batch_generate(
    model,
    tokenizer,
    prompts,
    max_tokens=100,
)
for prompt, response in zip(prompts, responses):
    print(f"Q: {prompt}\nA: {response}\n")

# Basic generation
mlx_lm.generate --model mlx-community/Llama-3.2-3B-Instruct-4bit \
--prompt "Explain recursion:" \
--max-tokens 256
# With sampling parameters
mlx_lm.generate --model mlx-community/Mistral-7B-Instruct-v0.3-4bit \
--prompt "Write a poem about AI:" \
--temp 0.8 \
--top-p 0.95

# Start chat REPL (context preserved between turns)
mlx_lm.chat --model mlx-community/Llama-3.2-3B-Instruct-4bit

from mlx_lm import load, generate
model, tokenizer = load("mlx-community/Llama-3.2-3B-Instruct-4bit")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the capital of France?"},
]

# Render the chat messages into the model's prompt format.
# add_generation_prompt=True appends the assistant-turn header so the
# model continues as the assistant rather than extending the user turn.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
response = generate(model, tokenizer, prompt=prompt, max_tokens=256)
print(response)

# Convert with 4-bit quantization
mlx_lm.convert --hf-path meta-llama/Llama-3.2-3B-Instruct \
-q # Quantize to 4-bit
# With specific quantization
mlx_lm.convert --hf-path mistralai/Mistral-7B-Instruct-v0.3 \
-q \
--q-bits 8 \
--q-group-size 64
# Upload to Hugging Face Hub
mlx_lm.convert --hf-path meta-llama/Llama-3.2-1B-Instruct \
-q \
--upload-repo your-username/Llama-3.2-1B-Instruct-4bit-mlx

from mlx_lm import convert
convert(
    hf_path="meta-llama/Llama-3.2-3B-Instruct",
    mlx_path="./llama-3.2-3b-mlx",
    quantize=True,
    q_bits=4,
    q_group_size=64,
)

| Option | Default | Description |
|---|---|---|
| --q-bits | 4 | Quantization bits (4 or 8) |
| --q-group-size | 64 | Group size for quantization |
| --dtype | float16 | Data type for non-quantized weights |
| Method | Best For | Command |
|---|---|---|
| Basic | Quick conversion | |
| DWQ | Quality-preserving | |
| AWQ | Activation-aware | |
| Dynamic | Per-layer precision | |
| GPTQ | Established method | |
# 4-bit quantization during conversion
mlx_lm.convert --hf-path mistralai/Mistral-7B-v0.3 -q
# 8-bit for higher quality
mlx_lm.convert --hf-path mistralai/Mistral-7B-v0.3 -q --q-bits 8

See reference/quantization.md for details.

# Prepare training data (JSONL format)
# {"text": "Your training text here"}
# or
# {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
# Fine-tune with LoRA
mlx_lm.lora --model mlx-community/Llama-3.2-3B-Instruct-4bit \
--train \
--data ./data \
--iters 1000
# Generate with adapter
mlx_lm.generate --model mlx-community/Llama-3.2-3B-Instruct-4bit \
--adapter-path ./adapters \
--prompt "Your prompt here"

# Merge LoRA weights into base model
mlx_lm.fuse --model mlx-community/Llama-3.2-3B-Instruct-4bit \
--adapter-path ./adapters \
--save-path ./fused-model
# Or export to GGUF
mlx_lm.fuse --model mlx-community/Llama-3.2-3B-Instruct-4bit \
--adapter-path ./adapters \
--export-gguf

See reference/fine-tuning.md for details.

# Start server
mlx_lm.server --model mlx-community/Llama-3.2-3B-Instruct-4bit --port 8080
# Use with OpenAI client
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "default",
"messages": [{"role": "user", "content": "Hello!"}],
"max_tokens": 256
}'

from openai import OpenAI
# Point the OpenAI client at the local MLX server. The server does not
# validate the API key, but the client library requires a non-empty value.
client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")

response = client.chat.completions.create(
    model="default",  # the server routes every request to its loaded model
    messages=[{"role": "user", "content": "Explain MLX in one sentence."}],
    max_tokens=100,
)
print(response.choices[0].message.content)

See also: mlx-community models, stream_generate, mlx_lm.cache_prompt, batch_generate, tokenizer.apply_chat_template(), reference/quantization.md, reference/fine-tuning.md