cache-cost-tracking


Cache Cost Tracking

Monitor LLM costs and cache effectiveness.

Langfuse Automatic Tracking

python
import hashlib
from uuid import UUID

from langfuse.decorators import observe, langfuse_context

@observe(as_type="generation")
async def call_llm_with_cache(
    prompt: str,
    agent_type: str,
    analysis_id: UUID
) -> str:
    """LLM call with automatic cost tracking."""

    # Link to parent trace
    langfuse_context.update_current_trace(
        name=f"{agent_type}_generation",
        session_id=str(analysis_id)
    )

    # Check L1 (exact-match, in-process) cache first
    cache_key = hashlib.sha256(f"{agent_type}:{prompt}".encode()).hexdigest()
    if cache_key in lru_cache:
        langfuse_context.update_current_observation(
            metadata={"cache_layer": "L1", "cache_hit": True}
        )
        return lru_cache[cache_key]

    similar = await semantic_cache.get(prompt, agent_type)
    if similar:
        langfuse_context.update_current_observation(
            metadata={"cache_layer": "L2", "cache_hit": True}
        )
        return similar

    # LLM call - Langfuse tracks tokens/cost automatically
    response = await llm.generate(prompt)

    langfuse_context.update_current_observation(
        metadata={
            "cache_layer": "L4",
            "cache_hit": False,
            "prompt_cache_hit": response.usage.cache_read_input_tokens > 0
        }
    )

    return response.content

Hierarchical Cost Rollup

python
class AnalysisWorkflow:
    @observe(as_type="trace")
    async def run_analysis(self, url: str, analysis_id: UUID):
        """Parent trace aggregates child costs.

        Trace Hierarchy:
        run_analysis (trace)
        ├── security_agent (generation)
        ├── tech_agent (generation)
        └── synthesis (generation)
        """
        langfuse_context.update_current_trace(
            name="content_analysis",
            session_id=str(analysis_id),
            tags=["multi-agent"]
        )

        # `fetch_content` is an assumed helper that loads the page at `url`
        content = await self.fetch_content(url)
        for agent in self.agents:
            await self.run_agent(agent, content, analysis_id)

    @observe(as_type="generation")
    async def run_agent(self, agent, content, analysis_id):
        """Child generation - costs roll up to parent."""
        langfuse_context.update_current_observation(
            name=f"{agent.name}_generation",
            metadata={"agent_type": agent.name}
        )
        return await agent.analyze(content)

Cost Queries

python
from datetime import datetime, timedelta
from uuid import UUID

from langfuse import Langfuse

langfuse = Langfuse()  # shared client for both queries below

async def get_analysis_costs(analysis_id: UUID) -> dict:
    traces = langfuse.get_traces(session_id=str(analysis_id), limit=1)

    if traces.data:
        trace = traces.data[0]
        return {
            "total_cost": trace.total_cost,
            "input_tokens": trace.usage.input_tokens,
            "output_tokens": trace.usage.output_tokens,
            "cache_read_tokens": trace.usage.cache_read_input_tokens,
        }
    return {}

async def get_costs_by_agent() -> list[dict]:
    generations = langfuse.get_generations(
        from_timestamp=datetime.now() - timedelta(days=7),
        limit=1000
    )

    costs: dict[str, dict] = {}
    for gen in generations.data:
        agent = gen.metadata.get("agent_type", "unknown")
        if agent not in costs:
            costs[agent] = {"agent": agent, "total": 0, "calls": 0, "cache_hits": 0}

        costs[agent]["total"] += gen.calculated_total_cost or 0
        costs[agent]["calls"] += 1
        if gen.metadata.get("cache_hit"):
            costs[agent]["cache_hits"] += 1

    return list(costs.values())
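A per-agent hit rate falls out of the aggregates built above; a small helper over the same `{agent: {"calls": ..., "cache_hits": ...}}` shape:

```python
def hit_rates(costs: dict[str, dict]) -> dict[str, float]:
    """Cache hit rate per agent, guarding against zero calls."""
    return {
        agent: (stats["cache_hits"] / stats["calls"]) if stats["calls"] else 0.0
        for agent, stats in costs.items()
    }
```

For example, `hit_rates({"security": {"calls": 10, "cache_hits": 7}})` yields `{"security": 0.7}`.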

Cache Effectiveness

python
# `generations` = langfuse.get_generations(...).data from the query above
cache_hits = 0
cache_misses = 0
cost_saved = 0.0

for gen in generations:
    if gen.metadata.get("cache_hit"):
        cache_hits += 1
        cost_saved += estimate_full_cost(gen)
    else:
        cache_misses += 1

hit_rate = cache_hits / max(cache_hits + cache_misses, 1)
print(f"Cache Hit Rate: {hit_rate:.1%}")
print(f"Cost Saved: ${cost_saved:.2f}")

Key Decisions

| Decision | Recommendation |
| --- | --- |
| Trace grouping | session_id = analysis_id |
| Cost attribution | metadata.agent_type |
| Query window | 7-30 days |
| Dashboard | Langfuse web UI |

Common Mistakes

  • Not linking child to parent trace
  • Missing metadata for attribution
  • Not tracking cache hits separately
  • Ignoring prompt cache savings

Related Skills

  • semantic-caching
    - Redis caching
  • prompt-caching
    - Provider caching
  • langfuse-observability
    - Full observability

Capability Details

prompt-caching

Keywords: prompt cache, cache prompt, prefix caching, cache breakpoints
Solves:
  • Reduce token costs with cached prompts
  • Configure cache breakpoints
  • Implement provider-native caching
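As a concrete example of a cache breakpoint, Anthropic's Messages API marks a prompt prefix as cacheable with a `cache_control` entry. The sketch below only builds the request payload (model name and token limit are illustrative); it does not call the API:

```python
def build_cached_request(system_prompt: str, user_msg: str) -> dict:
    """Request body with a cache breakpoint after the system prompt."""
    return {
        "model": "claude-sonnet-4-5",  # illustrative model id
        "max_tokens": 1024,
        "system": [
            {
                "type": "text",
                "text": system_prompt,
                # Everything up to this breakpoint is cached provider-side.
                "cache_control": {"type": "ephemeral"},
            }
        ],
        "messages": [{"role": "user", "content": user_msg}],
    }
```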

response-caching

Keywords: response cache, semantic cache, cache response, LLM cache
Solves:
  • Cache LLM responses for repeated queries
  • Implement semantic similarity caching
  • Reduce API calls with cached responses

cost-calculation

Keywords: cost, token cost, calculate cost, pricing, usage cost
Solves:
  • Calculate token costs by model
  • Track input/output token pricing
  • Estimate cost before execution
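For the "estimate cost before execution" case, a rough pre-flight estimate can be made from prompt length before any tokens are counted. The prices and the ~4-characters-per-token heuristic below are assumptions:

```python
# Illustrative per-million-token prices; check your provider's price sheet.
PRICING = {
    "fast-model": {"input": 0.25, "output": 1.25},
    "smart-model": {"input": 3.00, "output": 15.00},
}

def estimate_cost(model: str, prompt: str, expected_output_tokens: int = 500) -> float:
    """Pre-execution cost estimate using ~4 characters per token for English."""
    rates = PRICING[model]
    est_input_tokens = len(prompt) / 4
    return (
        est_input_tokens * rates["input"]
        + expected_output_tokens * rates["output"]
    ) / 1_000_000
```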

usage-tracking

Keywords: usage, track usage, token usage, API usage, metrics
Solves:
  • Track LLM API usage over time
  • Monitor token consumption
  • Generate usage reports

cache-invalidation

Keywords: invalidate, cache invalidation, TTL, expire, refresh
Solves:
  • Implement cache invalidation strategies
  • Configure TTL for cached responses
  • Handle stale cache entries