Loading...
Loading...
Strategies for managing LLM context windows effectively in AI agents. Use when building agents that handle long conversations, multi-step tasks, tool orchestration, or need to maintain coherence across extended interactions.
npx skill4agent add itsmostafa/llm-engineering-skills context-engineering

Effective Context = Relevant Information / Total Tokens

| Aspect | Details |
|---|---|
| Mechanism | Sliding window over conversation history |
| Pros | Deterministic, zero latency, preserves recent context verbatim |
| Cons | Abrupt loss of long-range context, "amnesia" effect |
| Best for | Independent tasks, short interactions, predictable workflows |
def trim_context(messages: list, keep_last_n: int = 10) -> list:
"""Keep system message + last N turns."""
system_msgs = [m for m in messages if m["role"] == "system"]
other_msgs = [m for m in messages if m["role"] != "system"]
return system_msgs + other_msgs[-keep_last_n:]| Aspect | Details |
|---|---|
| Mechanism | LLM generates summary of older context |
| Pros | Retains long-range memory, smoother UX, scalable |
| Cons | Summarization bias risk, added latency, potential compounding errors |
| Best for | Complex multi-step tasks, long-horizon interactions |
# Instruction handed to the summarizer model (see summarize_context below):
# asks for a structured-markdown digest of the conversation so far.
SUMMARIZATION_PROMPT = """Summarize the conversation so far, preserving:
1. Key decisions made
2. Important context established
3. Current task state and goals
4. Any constraints or preferences expressed
Be concise but complete. Output as structured markdown."""
async def summarize_context(messages: list, model) -> str:
    """Ask `model` to produce a digest of the conversation history.

    NOTE(review): relies on the module-level SUMMARIZATION_PROMPT and on a
    format_messages_for_summary helper defined elsewhere in this file.
    """
    transcript = format_messages_for_summary(messages)
    reply = await model.generate(
        system=SUMMARIZATION_PROMPT,
        user=transcript,
    )
    return reply.content


class HybridContextManager:
    """Keep recent turns verbatim; fold older ones into a running summary."""

    def __init__(
        self,
        keep_recent: int = 5,        # Recent turns to keep verbatim
        summary_threshold: int = 20, # When to trigger summarization
    ):
        self.keep_recent = keep_recent
        self.summary_threshold = summary_threshold
        self.running_summary = ""

    def process(self, messages: list) -> list:
        """Return the context to send: raw messages while the conversation is
        short, otherwise a summary system message plus the recent tail."""
        # Guard clause: short conversations pass through untouched.
        if len(messages) < self.summary_threshold:
            return messages
        # Fold everything except the recent tail into the running summary.
        older = messages[:-self.keep_recent]
        self.running_summary = summarize(older, self.running_summary)
        summary_msg = {
            "role": "system",
            "content": f"Previous context:\n{self.running_summary}",
        }
        return [summary_msg, *messages[-self.keep_recent:]]

# Role
You are [specific role] that [primary function].
# Capabilities
- [Capability 1 with scope]
- [Capability 2 with scope]
# Constraints
- [Hard constraint]
- [Preference]
# Output Format
[Specific format requirements]

# Anti-pattern: Loading everything upfront
# Illustrative snippet contrasting eager context loading with just-in-time
# retrieval via tools. Helper names (load_all_user_data, load_all_documents,
# Tool) are placeholders defined elsewhere — not runnable as-is.
context = load_all_user_data() # Large, mostly unused
context += load_all_documents() # Even larger
# Better: Just-in-time retrieval
tools = [
Tool(
name="get_user_preference",
description="Get specific user preference by key",
# Only fetches what's needed when asked
),
Tool(
name="search_documents",
description="Search documents by query",
# Returns relevant subset
),
]
# Well-designed tool
def search_codebase(query: str, max_results: int = 5) -> str:
    """Search codebase for relevant code snippets.

    Args:
        query: Natural language description of what to find
        max_results: Maximum snippets to return (default 5)

    Returns:
        Formatted code snippets with file paths and line numbers,
        or 'No results found' if nothing matches.
    """
    results = perform_search(query, limit=max_results)
    if not results:
        return "No results found for query."
    return format_results(results)  # Concise, structured output


async def compaction_loop(agent, messages, task):
    """Run `agent` until `task` completes, compacting the message history
    whenever the estimated token count nears the module-level TOKEN_LIMIT.

    NOTE(review): summarize_context is called here with a single argument,
    but the version defined earlier in this file takes (messages, model) —
    confirm which signature is intended.
    """
    while not task.complete:
        # Process next step
        response = await agent.run(messages)
        messages.append(response)
        # Compact at 80% of the budget to leave headroom for the next turn.
        if estimate_tokens(messages) > TOKEN_LIMIT * 0.8:
            summary = await summarize_context(messages[:-3])
            messages = [
                {"role": "system", "content": agent.system_prompt},
                {"role": "assistant", "content": f"Summary of progress:\n{summary}"},
                *messages[-3:],  # Keep recent context verbatim
            ]
    return messages


class NoteTakingAgent:
    """Agent with a key-value scratchpad held OUTSIDE the context window."""

    def __init__(self):
        self.notes = {}  # key -> saved content, persists across turns

    async def run(self, messages):
        # NOTE(review): self.agent is never assigned in this class — the
        # wrapped agent is presumably injected elsewhere; confirm.
        tools = [
            Tool("save_note", self.save_note, "Save information for later"),
            Tool("get_note", self.get_note, "Retrieve saved information"),
            Tool("list_notes", self.list_notes, "List all saved note keys"),
        ]
        return await self.agent.run(messages, tools=tools)

    def save_note(self, key: str, content: str) -> str:
        self.notes[key] = content
        return f"Saved note: {key}"

    def get_note(self, key: str) -> str:
        return self.notes.get(key, f"No note found for key: {key}")

    def list_notes(self) -> str:
        # Fix: this method was registered as a tool above but never defined,
        # which raised AttributeError as soon as run() built the tool list.
        if not self.notes:
            return "No notes saved."
        return "\n".join(self.notes)


class OrchestratorAgent:
    """Top-level agent that farms work out to specialist sub-agents so each
    one operates in its own fresh context window."""

    def __init__(self):
        self.sub_agents = {
            "researcher": ResearchAgent(),
            "coder": CodingAgent(),
            "reviewer": ReviewAgent(),
        }

    async def delegate(self, task: str, agent_type: str) -> str:
        """Delegate to sub-agent, receive condensed summary."""
        agent = self.sub_agents[agent_type]
        # Sub-agent works with fresh context
        result = await agent.run(task)
        # Return only essential findings to main context
        return result.summary  # Not the full conversation


class SessionMemory:
    """Conversation store that compacts itself to stay under a token budget:
    verbatim recent turns plus a running summary of everything older."""

    def __init__(
        self,
        keep_last_n_turns: int = 5,
        context_limit: int = 100_000,  # tokens
        summarizer=None,
    ):
        self.keep_last_n_turns = keep_last_n_turns
        self.context_limit = context_limit
        # NOTE(review): summarizer must be provided before compaction
        # triggers, or _maybe_compact will raise AttributeError.
        self.summarizer = summarizer
        self.messages = []
        self.summary = ""

    async def add_message(self, message: dict):
        """Append a message, then compact if the budget is nearly exhausted."""
        self.messages.append(message)
        await self._maybe_compact()

    async def _maybe_compact(self):
        # Compact at 80% of the limit to leave headroom for the next reply.
        current_tokens = estimate_tokens(self.messages)
        if current_tokens > self.context_limit * 0.8:
            # Summarize all but recent messages, folding in the old summary.
            old_messages = self.messages[:-self.keep_last_n_turns]
            self.summary = await self.summarizer.summarize(
                old_messages,
                previous_summary=self.summary,
            )
            self.messages = self.messages[-self.keep_last_n_turns:]

    def get_context(self) -> list:
        """Return the summary system message (if any) + recent messages."""
        context = []
        if self.summary:
            context.append({
                "role": "system",
                "content": f"Conversation summary:\n{self.summary}",
            })
        context.extend(self.messages)
        return context


def estimate_tokens(messages: list) -> int:
    """Rough token estimation (4 chars ≈ 1 token for English).

    Fix: tolerates messages whose "content" is missing or None (common for
    tool-call turns), instead of raising TypeError on len(None).
    """
    total_chars = sum(
        len(m.get("content") or "")
        for m in messages
    )
    return total_chars // 4


def estimate_tokens_accurate(messages: list, model: str) -> int:
    """Accurate token count using tiktoken (third-party, imported lazily)."""
    import tiktoken
    encoding = tiktoken.encoding_for_model(model)
    return sum(
        len(encoding.encode(m.get("content") or ""))
        for m in messages
    )