Provides guidance for automatically evolving and optimizing AI agents across any domain using LLM-driven evolution algorithms. Use when building self-improving agents, optimizing agent prompts and skills against benchmarks, or implementing automated agent evaluation loops.
npx skill4agent add orchestra-research/ai-research-skills evolving-ai-agents

pip install a-evolve # Core
pip install a-evolve[anthropic] # With Claude support
pip install a-evolve[all] # All providers

Quickstart, evolving a built-in agent against a built-in benchmark:

import agent_evolve as ae
evolver = ae.Evolver(agent="swe", benchmark="swe-verified")
results = evolver.run(cycles=10)
print(f"Final score: {results.final_score}")my-agent/
The agent workspace is an ordinary directory, versioned with git:

my-agent/
├── manifest.yaml # Metadata + entrypoint
├── prompts/
│ ├── system.md # Main system prompt (evolved)
│ └── fragments/ # Modular prompt pieces
├── skills/
│ └── skill-name/
│ └── SKILL.md # Reusable procedure with frontmatter
├── memory/
│ ├── episodic.jsonl # Lessons from failures
│ └── semantic.jsonl # General knowledge
├── tools/
│ ├── registry.yaml # Tool manifest
│ └── tool_name.py # Tool implementations
└── evolution/ # Managed by engine (metrics, history)

Three extension points drive the loop:

# 1. Agent — implements solve()
class MyAgent(ae.BaseAgent):
    def solve(self, task: ae.Task) -> ae.Trajectory:
        # Domain-specific solving logic
        return ae.Trajectory(task_id=task.id, output=result, steps=steps)

# 2. Benchmark — implements get_tasks() and evaluate()
class MyBenchmark(ae.BenchmarkAdapter):
    def get_tasks(self, split="train", limit=None) -> list[ae.Task]:
        return [ae.Task(id="1", input="...")]

    def evaluate(self, task: ae.Task, trajectory: ae.Trajectory) -> ae.Feedback:
        return ae.Feedback(success=True, score=0.95, detail="Passed")

# 3. Engine — implements step()
class MyEngine(ae.EvolutionEngine):
    def step(self, workspace, observations, history, trial):
        # Mutate workspace based on observations
        return ae.StepResult(mutated=True, summary="Updated prompts")

The agent implements `BaseAgent.solve()` and returns a `Trajectory`; the benchmark subclasses `BenchmarkAdapter` and provides `get_tasks()` and `evaluate()`. The workspace needs at least `manifest.yaml` and `prompts/system.md`, and must be a git repository (`git init && git add -A && git commit -m "init"`) so every evolution step can be committed.
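If you are starting a workspace from scratch, a throwaway scaffold along these lines works; the directory names follow the layout above, but the manifest fields are placeholders, not a documented schema:

```python
from pathlib import Path
import subprocess

# Hypothetical scaffold for a new agent workspace.
ws = Path("my-agent")
for sub in ["prompts/fragments", "skills", "memory", "tools", "evolution"]:
    (ws / sub).mkdir(parents=True, exist_ok=True)

# Placeholder manifest fields — check the real schema before relying on them.
(ws / "manifest.yaml").write_text("name: my-agent\nentrypoint: agent.py  # placeholder\n")
(ws / "prompts" / "system.md").write_text("You are a careful software engineer.\n")

subprocess.run(["git", "init"], cwd=ws, check=True)
subprocess.run(["git", "add", "-A"], cwd=ws, check=True)
subprocess.run(["git", "commit", "-m", "init"], cwd=ws, check=True)
```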
A fuller run, pointing the Evolver at a workspace on disk:

import agent_evolve as ae

# Configure evolution parameters
config = ae.EvolveConfig(
batch_size=10, # Tasks per solve round
max_cycles=20, # Maximum evolution iterations
evolve_prompts=True, # Mutate system prompt
evolve_skills=True, # Discover and refine skills
evolve_memory=True, # Build episodic memory
evolver_model="us.anthropic.claude-opus-4-6-v1",
)
# Point to your agent workspace and benchmark
evolver = ae.Evolver(
agent="./my-agent-workspace",
benchmark="swe-verified", # Or custom BenchmarkAdapter instance
config=config,
)
# Run evolution
results = evolver.run(cycles=10)
# Inspect results
print(f"Cycles completed: {results.cycles_completed}")
print(f"Final score: {results.final_score}")
print(f"Converged: {results.converged}")
for cycle_num, score in enumerate(results.score_history):
print(f" Cycle {cycle_num + 1}: {score:.3f}")cd my-agent-workspace
git log --oneline # See evo-1, evo-2, ... tags
git diff evo-1 evo-10 # Compare first and last evolution
cat prompts/system.md # Read evolved prompt
ls skills/ # See discovered skills

To evaluate on your own tasks, implement a BenchmarkAdapter:

import agent_evolve as ae
class CodeReviewBenchmark(ae.BenchmarkAdapter):
    """Evaluate agents on code review quality."""

    def get_tasks(self, split="train", limit=None):
        tasks = load_review_dataset(split)
        if limit:
            tasks = tasks[:limit]
        return [
            ae.Task(id=t["id"], input=t["diff"], metadata={"expected": t["comments"]})
            for t in tasks
        ]

    def evaluate(self, task, trajectory):
        expected = task.metadata["expected"]
        actual = trajectory.output
        precision, recall = compute_review_metrics(expected, actual)
        f1 = 2 * precision * recall / (precision + recall + 1e-9)
        return ae.Feedback(
            success=f1 > 0.7,
            score=f1,
            detail=f"P={precision:.2f} R={recall:.2f} F1={f1:.2f}",
        )
# Use with any agent
evolver = ae.Evolver(agent="./my-agent", benchmark=CodeReviewBenchmark())
results = evolver.run(cycles=5)

To control how the workspace is mutated, implement a custom EvolutionEngine:

import agent_evolve as ae
class RuleBasedEngine(ae.EvolutionEngine):
    def step(self, workspace, observations, history, trial):
        failures = [o for o in observations if not o.feedback.success]
        if not failures:
            return ae.StepResult(mutated=False, summary="No failures to address")
        # Analyze failure patterns
        error_types = categorize_errors(failures)
        prompt = workspace.read_prompt()
        # Append learned rules to the prompt
        new_rules = generate_rules(error_types)
        workspace.write_prompt(prompt + "\n" + "\n".join(new_rules))
        return ae.StepResult(
            mutated=True,
            summary=f"Added {len(new_rules)} rules from {len(failures)} failures",
        )
evolver = ae.Evolver(
agent="./my-agent",
benchmark="my-benchmark",
engine=RuleBasedEngine(),
)

Reference agents:

| Domain | Model | Key Feature |
|---|---|---|
| SWE-bench | Claude Opus 4.6 | Verify-fix loop, skill proposals |
| Terminal-Bench | Claude Sonnet 4 | Concurrent timeout, env discovery |
| MCP-Atlas | Claude Opus 4.6 | MCP server integration |
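Reference agents are selected by seed name, while your own workspace is passed as a path; "swe" and "swe-verified" come from the quickstart, and seed names beyond "swe" (such as "terminal" or "mcp") are an assumption based on the table above:

```python
import agent_evolve as ae

# agent= accepts a built-in seed name or a local workspace directory.
# "terminal" and "mcp" as seed names are assumptions, not confirmed identifiers.
from_seed = ae.Evolver(agent="swe", benchmark="swe-verified")
from_path = ae.Evolver(agent="./my-agent-workspace", benchmark="swe-verified")
```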
Reference benchmarks:

| Domain | Metric |
|---|---|
| Code patching | Pass rate |
| Tool calling | Accuracy |
| Shell tasks | Pass rate |
| Multi-step procedures | Accuracy |
| Interactive games | RHAE score |
Evolution algorithms:

| Algorithm | Strategy | Best For |
|---|---|---|
| A-Evolve/SkillForge | LLM-driven workspace mutation | General-purpose |
| Guided Synthesis | Memory-first, curated skills | Skill discovery |
| Adaptive Evolution | Reward tracking, filtered observations | Fine-grained control |
| Adaptive Skill | Skill-centric refinement | Skill-heavy domains |
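As an illustration of what "LLM-driven workspace mutation" means, an engine in that spirit can be sketched with the same EvolutionEngine interface shown earlier; llm_complete() is a hypothetical stand-in for a call to the configured evolver model, and this is not the built-in A-Evolve engine:

```python
import agent_evolve as ae

def llm_complete(prompt: str) -> str:
    """Hypothetical helper: call the evolver model and return its text reply."""
    raise NotImplementedError

class LLMDrivenEngine(ae.EvolutionEngine):
    """Illustrative sketch of LLM-driven prompt mutation, not the library's engine."""

    def step(self, workspace, observations, history, trial):
        current_prompt = workspace.read_prompt()
        report = "\n".join(
            f"task={o.trajectory.task_id} success={o.feedback.success} detail={o.feedback.detail}"
            for o in observations
        )
        revised = llm_complete(
            "Here is an agent system prompt and feedback from its last batch of tasks.\n"
            "Rewrite the prompt to address the recurring failures; return only the new prompt.\n\n"
            f"PROMPT:\n{current_prompt}\n\nFEEDBACK:\n{report}"
        )
        if revised.strip() == current_prompt.strip():
            return ae.StepResult(mutated=False, summary="Model proposed no change")
        workspace.write_prompt(revised)
        return ae.StepResult(mutated=True, summary=f"Rewrote system prompt at trial {trial}")
```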
Full configuration reference:

ae.EvolveConfig(
batch_size=10, # Tasks per solve round
max_cycles=20, # Max evolution iterations
holdout_ratio=0.2, # Test set split for gating
evolve_prompts=True, # Mutate system prompts
evolve_skills=True, # Discover/refine skills
evolve_memory=True, # Build episodic memory
evolve_tools=False, # Mutate tool implementations
trajectory_only=False, # Hide scores from evolver
evolver_model="us.anthropic.claude-opus-4-6-v1",
evolver_max_tokens=16384,
egl_threshold=0.05, # Convergence epsilon
egl_window=3, # Cycles for plateau detection
)

Evolution halts early once it plateaus: roughly, when the score improvement over the last `egl_window` cycles stays below `egl_threshold`, the run reports `converged=True`.
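A minimal sketch of that plateau test, assuming it simply compares the best recent score against the best score before the window; the real criterion may differ:

```python
def plateaued(score_history, egl_window=3, egl_threshold=0.05):
    """Assumed plateau test: has the best score improved by less than
    egl_threshold over the last egl_window cycles?"""
    if len(score_history) <= egl_window:
        return False
    recent_best = max(score_history[-egl_window:])
    earlier_best = max(score_history[:-egl_window])
    return recent_best - earlier_best < egl_threshold

# e.g. plateaued([0.40, 0.52, 0.55, 0.56, 0.56, 0.57]) -> True with the defaults
```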
Skills discovered during evolution are stored under skills/<skill-name>/SKILL.md, with frontmatter that tells the agent when to trigger them:

---
name: verify-edge-cases
description: "TRIGGER when: checking boundary conditions. DO NOT TRIGGER: for happy-path tests."
---
## Pattern
Test all falsy-but-valid values: 0, False, "", [], {}
## Process
1. List all input boundaries
2. Run each against the implementation
3. Check both output AND side effects
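Alongside skills, the engine can build episodic memory (memory/episodic.jsonl), one lesson per line distilled from failures. The record schema is not documented here, so the fields below are placeholders:

```python
import json
from pathlib import Path

# Placeholder schema for a lesson entry — adjust to whatever the engine actually writes.
lesson = {
    "task_id": "42",
    "symptom": "patch passed the targeted test but broke an unrelated CLI flag",
    "lesson": "run the full test suite, not just the failing test, before declaring success",
}
with Path("my-agent/memory/episodic.jsonl").open("a") as f:
    f.write(json.dumps(lesson) + "\n")
```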