Loading...
Loading...
Write Harbor task verifiers using Reward Kit. Use when creating or editing a task's tests/ directory, adding grading criteria, setting up LLM/agent judges, or designing verifiers that produce a reward score.
npx skill4agent add harbor-framework/harbor rewardkit
test.sh
tests/
tests/
├── test.sh
├── checks.py # programmatic criteria
└── judge.toml # optional LLM/agent judge
tests/test.sh
#!/bin/bash
uvx --from 'harbor-rewardkit==0.1.*' rewardkit /tests/tests/
/app, /logs/verifier/reward.json
task.toml
[verifier.env]
ANTHROPIC_API_KEY = "${ANTHROPIC_API_KEY}"
[environment]
network_mode = "no-network" # Agent env baseline — offline during agent.run()
[verifier]
environment_mode = "separate"
[verifier.environment]
network_mode = "public" # Verifier env baseline — LLM judge API calls
docker_image = "python:3.12-slim"
[environment].network_mode, [verifier].network_mode, environment_mode = "separate", [verifier.environment].network_mode, public, no-network
tests/, /tests/test.sh, tests/, .py, tests/
import rewardkit as rk
rk.file_exists("output.txt")
rk.file_contains("output.txt", "hello")
rk.command_succeeds("python main.py", weight=2.0)
rk.json_key_equals("result.json", "status", "ok")
weight, 1.0, isolated, False
file_exists, file_not_exists, file_contains, file_contains_regex, file_matches, files_equal, diff_ratio
command_succeeds, command_output_contains, command_output_matches, command_output_matches_regex, cwd
json_key_equals, json_path_equals, csv_cell_equals, xlsx_cell_equals, [office], sqlite_query_equals
http_status_equals, http_response_contains
image_similarity, image_size_equals, [image]
trajectory_tool_used, trajectory_tool_not_used, trajectory_turn_count
uv tool install harbor-rewardkit[all]
@criterion, workspace: Path, bool, float
from pathlib import Path
from rewardkit import criterion
@criterion
def has_valid_output(workspace: Path) -> bool:
    return (workspace / "output.txt").read_text().strip() != ""
rk
@criterion(description="output has at least {n} lines")
def has_n_lines(workspace: Path, n: int) -> bool:
    return len((workspace / "output.txt").read_text().splitlines()) >= n
rk.has_n_lines(10, weight=2.0)
rk.has_n_lines(50, weight=1.0)
shared=True
[judge]
judge = "anthropic/claude-sonnet-4-6" # LiteLLM model string
files = ["/app/main.py"]
[[criterion]]
description = "Is the code correct?"
type = "binary"
[[criterion]]
description = "How readable is the code?"
type = "likert"
points = 5
weight = 2.0
binary, likert, numeric
[judge]
judge = "claude-code"
model = "anthropic/claude-sonnet-4-6"
isolated = true
[[criterion]]
description = "Does the solution handle edge cases?"
type = "binary"
[judge], timeout, reasoning_effort, low, medium, high, reference, atif-trajectory, weight, prompt_template, {criteria}
[scoring]
aggregation = "all_pass" # weighted_mean | all_pass | any_pass | threshold
threshold = 0.7 # only for threshold
tests/
├── test.sh
├── correctness/
│   └── check.py
├── structure/
│   └── files_exist.py
└── quality/
    └── quality.toml
{ "correctness": 0.75, "structure": 1.0, "quality": 0.6 }
tests/reward.toml, [[reward]], reward.json, [scoring]
# tests/reward.toml
[[reward]]
name = "reward"
aggregation = "all_pass" # weighted_mean | all_pass | any_pass | threshold
# threshold = 0.7 # only for threshold
{ "correctness": 0.75, "structure": 1.0, "quality": 0.6, "reward": 0.0 }
name, reward-details.json
/logs/verifier/reward.json, /logs/verifier/reward-details.json
tests/steps/{name}/tests/, /tests, /app, /logs/verifier/reward.json
multi_step_reward_strategy, task.toml
tests/, /tests, tests/checks.py, shared=True, test.sh
steps/foo/tests/correctness/structure/quality/
multi_step_reward_strategy = "mean", "final"
@criterion, isolated=True
examples/tasks/reward-kit-example/