Skip to content
Build
Build › Building Blocks

Harness

Spawn external coding agents (Claude Code, Codex, Gemini, OpenCode) from inside agent loops with budget caps, schema-bound output, and retries.

The building block for driving external coding agents from inside your loop. app.harness(prompt, schema=..., provider=...) spawns Claude Code, Codex, Gemini, or OpenCode as a managed subprocess (or SDK call), waits for structured output, validates it against your schema, and hands back a rich result with cost and retry metadata.

Without harness, you would manage subprocess plumbing, JSON repair, retry-with-backoff, schema validation, output-file cleanup, and per-provider quirks yourself. With harness, those become one call you can wrap in a loop, a tournament, or an adversarial pattern.

The shape

from typing import Literal

from agentfield import Agent, HarnessConfig
from pydantic import BaseModel

class ReviewResult(BaseModel):
    """Structured review the harness must return for a diff."""

    # Individual issues the reviewer reported.
    findings: list[str]
    # Restricted to the three allowed levels so schema validation rejects
    # anything else instead of letting an arbitrary string through.
    severity: Literal["low", "medium", "high"]

# Agent-level harness defaults; each .harness(...) call on this agent inherits them.
review_defaults = HarnessConfig(
    provider="claude-code",
    model="sonnet",
    permission_mode="plan",
    max_turns=12,
    max_budget_usd=0.50,
    tools=["Read", "Grep", "Glob"],
)

app = Agent(node_id="reviewer", harness_config=review_defaults)

@app.reasoner()
async def review_diff(diff: str) -> dict:
    """Review a diff with the agent's default harness and summarize the run.

    Args:
        diff: Diff text, embedded verbatim in the prompt.

    Returns:
        ``{"ok": False, "error": ...}`` when the run failed or produced no
        schema-valid output; otherwise ``{"ok": True, ...}`` with the dumped
        review plus cost, turn count and session id.
    """
    result = await app.harness(
        f"Review this diff. Be precise, no fluff.\n\n{diff}",
        schema=ReviewResult,
    )

    if result.is_error:
        return {"ok": False, "error": result.error_message}

    # parsed is None when validation fell through every repair layer; guard it
    # so the reasoner reports a failure instead of raising AttributeError.
    if result.parsed is None:
        return {"ok": False, "error": "harness returned no schema-valid output"}

    return {
        "ok": True,
        "review": result.parsed.model_dump(),
        "cost_usd": result.cost_usd,
        "num_turns": result.num_turns,
        "session_id": result.session_id,
    }
import { Agent, type HarnessConfig } from '@agentfield/sdk';
import { z } from 'zod';

/** Zod schema the harness output is validated against. */
const ReviewResult = z.object({
  // Individual issues the reviewer reported.
  findings: z.string().array(),
  // One of the three allowed severity levels.
  severity: z.enum(['low', 'medium', 'high']),
});

// Agent-level harness defaults; every app.harness(...) call inherits them.
const harnessConfig: HarnessConfig = {
  provider: 'claude-code',
  model: 'sonnet',
  permissionMode: 'plan',
  tools: ['Read', 'Grep', 'Glob'],
  maxTurns: 12,
  maxBudgetUsd: 0.5,
};

const app = new Agent({
  nodeId: 'reviewer',
  harnessConfig,
});

/**
 * Reasoner that reviews a diff with the agent's default harness.
 *
 * Returns `{ ok: false, error }` when the run failed or its output does not
 * satisfy the schema; otherwise `{ ok: true, ... }` with the typed review plus
 * cost, turn count and session id.
 */
app.reasoner('review_diff', async (input: { diff: string }) => {
  const result = await app.harness(
    `Review this diff. Be precise, no fluff.\n\n${input.diff}`,
    { schema: ReviewResult },
  );

  if (result.isError) {
    return { ok: false, error: result.errorMessage };
  }

  // safeParse rather than parse: result.parsed is null when validation fell
  // through every repair layer, and parse() would throw out of the reasoner.
  const review = ReviewResult.safeParse(result.parsed);
  if (!review.success) {
    return {
      ok: false,
      error: `harness output failed schema validation: ${review.error.message}`,
    };
  }

  return {
    ok: true,
    review: review.data,
    costUsd: result.costUsd,
    numTurns: result.numTurns,
    sessionId: result.sessionId,
  };
});
package main

import (
    "context"
    "fmt"

    "github.com/Agent-Field/agentfield/sdk/go/agent"
    "github.com/Agent-Field/agentfield/sdk/go/harness"
)

// ReviewResult is the destination struct the harness output is decoded into.
type ReviewResult struct {
    // Findings is serialized under the "findings" JSON key.
    Findings []string `json:"findings"`
    // Severity is serialized under the "severity" JSON key; reviewDiff's
    // siblings in other SDKs use "low" / "medium" / "high".
    Severity string   `json:"severity"`
}

// newReviewer constructs the "reviewer" agent with harness defaults pinned on
// it, so individual Harness calls can pass empty options.
func newReviewer() (*agent.Agent, error) {
    defaults := agent.HarnessConfig{
        Provider:       "claude-code",
        Model:          "sonnet",
        PermissionMode: "plan",
        MaxTurns:       12,
    }
    cfg := agent.Config{
        NodeID:        "reviewer",
        HarnessConfig: &defaults,
    }
    return agent.New(cfg)
}

// reviewDiff asks the agent's harness to review diff and decodes the
// structured answer into a ReviewResult.
//
// It returns an error when the JSON schema cannot be derived from the result
// struct, when the Harness call itself fails, or when the run ends in a
// terminal error (result.IsError).
func reviewDiff(ctx context.Context, app *agent.Agent, diff string) (*ReviewResult, error) {
    var out ReviewResult
    // Surface schema-generation failures instead of calling the harness with
    // whatever value accompanies the discarded error.
    schema, err := harness.StructToJSONSchema(out)
    if err != nil {
        return nil, fmt.Errorf("harness: build schema: %w", err)
    }

    result, err := app.Harness(ctx,
        "Review this diff. Be precise, no fluff.\n\n"+diff,
        schema, &out,
        harness.Options{}, // empty — inherits HarnessConfig from the agent
    )
    if err != nil {
        return nil, err
    }
    if result.IsError {
        return nil, fmt.Errorf("harness: %s", result.ErrorMessage)
    }
    return &out, nil
}

HarnessConfig — defaults on the Agent

Pin defaults on the agent constructor; override per call only for the dimensions that genuinely vary by task.

| Field | Type | Default | What it does |
| --- | --- | --- | --- |
| provider | string | required | "claude-code" / "codex" / "gemini" / "opencode". |
| model | string | "sonnet" | Provider-specific model identifier — e.g. "sonnet", "gpt-5-codex", "gemini-2.5-pro", "qwen/qwen3-coder". |
| max_turns | int | 30 | Hard cap on agent iterations. |
| max_budget_usd | float | null | Cost ceiling. Provider aborts when total spend would exceed this. |
| max_retries | int | 3 | Retry attempts for transient errors (rate-limit, 5xx, connection reset). |
| initial_delay / max_delay / backoff_factor | float | 1.0 / 30.0 / 2.0 | Exponential-backoff knobs for retries. |
| tools | string[] | ["Read","Write","Edit","Bash","Glob","Grep"] | Allowed tool names — gates what the coding agent may invoke. |
| permission_mode | string | null | "plan" (plan-first, then execute) or "auto" (bypass per-step prompts). |
| system_prompt | string | null | Custom system prompt prepended to the loop. |
| env | dict | {} | Extra environment variables forwarded to the subprocess. |
| cwd | string | working dir | Working directory the coding agent treats as the repo root. |
| project_dir | string | null | opencode only — maps to --dir. When set, cwd is used only for output-file placement. |
| codex_bin / gemini_bin / opencode_bin | string | binary name | Override CLI paths when the binary is not on $PATH. |

HarnessResult — what comes back

| Field | Type | What it is |
| --- | --- | --- |
| result / text | string | Raw agent response (last message text). |
| parsed | model / null | Validated schema instance when schema= was passed. null if validation fell through all repair layers. |
| is_error / error_message | bool / string | True when the run failed terminally. Inspect error_message for the diagnosis. |
| cost_usd | float / null | Total cost reported by the provider. null when the provider does not surface cost. |
| num_turns | int | Iterations the agent took. Useful for cost monitoring and tuning max_turns. |
| session_id | string | Provider session identifier — pass back in resume_session_id to continue a multi-turn run. |
| messages | list | Full message stream (provider-specific shape). Inspect for debugging. |

Schema-bound output

Pass a Pydantic model (Python), Zod schema (TypeScript), or Go struct as schema=. The runner injects an OUTPUT REQUIREMENTS suffix telling the agent to write JSON to .agentfield_output.json, then reads, repairs, validates, and returns result.parsed as a validated instance. Three recovery layers run before declaring failure:

  1. Parse the output file directly.
  2. Cosmetic repair — strip markdown fences, trailing commas, repair truncated braces.
  3. One-shot AI repair — re-emit the same content as valid JSON conforming to the schema (no tools, no exploration; cheap reformatting only).

After that, the run is retried up to max_retries times. The output file is cleaned up automatically.

Provider switching is a one-field flip

The four providers are interchangeable through provider=. Same loop code, different worker:

# Plan with Claude (careful reasoner), execute with Codex (fast implementer).
plan = await app.harness(prompt, provider="claude-code", permission_mode="plan", schema=ChangePlan)
edits = await app.harness(apply, provider="codex",       permission_mode="auto", schema=EditReport)
// Plan with Claude, then execute with Codex — only the options differ.
const plan = await app.harness(prompt, {
  provider: 'claude-code',
  permissionMode: 'plan',
  schema: ChangePlan,
});
const edits = await app.harness(apply, {
  provider: 'codex',
  permissionMode: 'auto',
  schema: EditReport,
});
plan, _ := app.Harness(ctx, prompt, planSchema, &planOut,
    harness.Options{Provider: "claude-code", PermissionMode: "plan"})
edits, _ := app.Harness(ctx, apply, editSchema, &editOut,
    harness.Options{Provider: "codex", PermissionMode: "auto"})

See the four provider docs for the role each one plays best.

Common patterns

Retry with provider fallback

When the primary provider hits its budget cap or a rate limit, fall back to a cheaper one without losing the loop's intent.

async def hardened_harness(prompt: str, schema, **opts):
    """Run the prompt on claude-code, falling back to opencode on failure.

    Args:
        prompt: Prompt forwarded unchanged to each provider.
        schema: Schema the output must validate against.
        **opts: Extra keyword arguments forwarded to ``app.harness``.

    Returns:
        The first result that is not an error and carries parsed output.

    Raises:
        RuntimeError: When every provider failed; the message lists the
            reason for each provider, not only the last one tried.
    """
    failures = []
    for provider in ("claude-code", "opencode"):
        result = await app.harness(prompt, schema=schema, provider=provider, **opts)
        if not result.is_error and result.parsed is not None:
            return result
        # A run can finish without an error yet still lack parsed output, in
        # which case error_message may be empty — say so explicitly.
        reason = result.error_message or "no schema-valid output"
        failures.append(f"{provider}: {reason}")
    raise RuntimeError(f"all providers failed: {'; '.join(failures)}")
/**
 * Runs the prompt on claude-code, falling back to opencode on failure.
 *
 * @param prompt - Prompt forwarded unchanged to each provider.
 * @param schema - Schema the output must validate against.
 * @param opts - Extra harness options; `provider` and `schema` are overridden.
 * @returns The first result that is not an error and carries parsed output.
 * @throws Error listing the failure reason for every provider tried.
 */
async function hardenedHarness<T>(prompt: string, schema: T, opts: HarnessOptions = {}) {
  const failures: string[] = [];
  for (const provider of ['claude-code', 'opencode'] as const) {
    const r = await app.harness(prompt, { ...opts, provider, schema });
    // `!= null` rejects both null and undefined parsed output.
    if (!r.isError && r.parsed != null) return r;
    failures.push(`${provider}: ${r.errorMessage ?? 'no schema-valid output'}`);
  }
  throw new Error(`all providers failed: ${failures.join('; ')}`);
}
// Try each provider in order and return the first successful run. The last
// failure is kept so the final error explains why the fallback chain failed.
var lastErr error
for _, provider := range []string{"claude-code", "opencode"} {
    r, err := app.Harness(ctx, prompt, schema, dest,
        harness.Options{Provider: provider})
    if err != nil {
        lastErr = err
        continue
    }
    if r.IsError {
        lastErr = errors.New(r.ErrorMessage)
        continue
    }
    return r, nil
}
return nil, fmt.Errorf("all providers failed: %w", lastErr)

Tournament

Run multiple providers in parallel against the same prompt and pick the best result with an LLM-as-judge call.

runs = await asyncio.gather(*[
    app.harness(prompt, provider=p, schema=ImplResult, max_budget_usd=0.05)
    for p in ("claude-code", "codex", "opencode")
])

verdict = await app.harness(
    "Pick the best implementation. Score on correctness and minimality.\n\n"
    + format_runs(runs),
    provider="claude-code",
    schema=Verdict,
)
return runs[verdict.parsed.winner_index]
// Fan the same prompt out to every provider concurrently.
const runs = await Promise.all(
  (['claude-code', 'codex', 'opencode'] as const).map((p) =>
    app.harness(prompt, { provider: p, schema: ImplResult, maxBudgetUsd: 0.05 }),
  ),
);

// Only judge runs that finished cleanly with schema-valid output.
const candidates = runs.filter((r) => !r.isError && r.parsed != null);
if (candidates.length === 0) {
  throw new Error('tournament: every provider failed');
}

const verdict = await app.harness(
  `Pick the best implementation.\n\n${formatRuns(candidates)}`,
  { provider: 'claude-code', schema: Verdict },
);
if (verdict.isError || verdict.parsed == null) {
  throw new Error(`tournament: judge failed: ${verdict.errorMessage}`);
}

// The index refers to the list the judge saw; an out-of-range pick would
// otherwise return undefined.
const winner = candidates[verdict.parsed.winnerIndex];
if (winner === undefined) {
  throw new Error(`tournament: judge picked invalid index ${verdict.parsed.winnerIndex}`);
}
return winner;
var runs []*harness.Result
for _, p := range []string{"claude-code", "codex", "opencode"} {
    r, _ := app.Harness(ctx, prompt, implSchema, &implOut,
        harness.Options{Provider: p, MaxBudgetUSD: 0.05})
    runs = append(runs, r)
}
v, _ := app.Harness(ctx, judgePrompt(runs), verdictSchema, &verdict,
    harness.Options{Provider: "claude-code"})

Adversarial verifier

One provider proposes a change, a second one verifies it. The verifier's output triggers re-runs until the severity drops below "high", the attempt limit is reached (three in the examples below), or the budget caps out.

for attempt in range(3):
    impl = await app.harness(prompt, provider="codex", schema=ImplResult)
    if impl.is_error:
        continue

    review = await app.harness(
        f"Find real bugs in this change:\n\n{impl.parsed.diff}",
        provider="claude-code",
        permission_mode="plan",
        schema=ReviewResult,
    )
    if review.parsed.severity in ("low", "medium"):
        return impl  # accepted

    prompt = f"Re-attempt. Previous reviewer findings:\n{review.parsed.findings}\n\n{prompt}"
// Up to three propose/verify rounds; a round is accepted as soon as the
// reviewer's severity is anything other than 'high'.
for (let attempt = 0; attempt < 3; attempt++) {
  const impl = await app.harness(prompt, { provider: 'codex', schema: ImplResult });
  // parsed can be null even without a terminal error, so check both before
  // reading impl.parsed.diff.
  if (impl.isError || impl.parsed == null) continue;

  const review = await app.harness(
    `Find real bugs in this change:\n\n${impl.parsed.diff}`,
    { provider: 'claude-code', permissionMode: 'plan', schema: ReviewResult },
  );
  // A failed review gives no verdict; retry the round rather than crash.
  if (review.isError || review.parsed == null) continue;

  if (review.parsed.severity !== 'high') return impl;
  prompt = `Re-attempt. Reviewer findings:\n${review.parsed.findings}\n\n${prompt}`;
}
// Up to three propose/verify rounds; a round is accepted as soon as the
// reviewer's severity is anything other than "high".
for attempt := 0; attempt < 3; attempt++ {
    impl, err := app.Harness(ctx, prompt, implSchema, &implOut,
        harness.Options{Provider: "codex"})
    // Check err first: impl may be nil when the call itself failed.
    if err != nil || impl.IsError {
        continue
    }

    review, err := app.Harness(ctx, reviewPrompt(implOut), reviewSchema, &reviewOut,
        harness.Options{Provider: "claude-code", PermissionMode: "plan"})
    // Without a successful review, reviewOut may hold a previous round's
    // verdict — retry instead of trusting it.
    if err != nil || review.IsError {
        continue
    }
    if reviewOut.Severity != "high" {
        return implOut, nil
    }

    prompt = retryPrompt(reviewOut, prompt)
}

Authentication

Each provider reads its own credentials from the environment forwarded to the harness subprocess. None of these need to be passed through harness_config.env if they're already in the process environment.

| Provider | Required env vars |
| --- | --- |
| claude-code | ANTHROPIC_API_KEY (or Vertex / Bedrock routing via claude_agent_sdk config) |
| codex | OPENAI_API_KEY, or run codex login once for OAuth |
| gemini | GEMINI_API_KEY, or run gemini auth login once for OAuth |
| opencode | Whatever its opencode auth login flow configures — OPENROUTER_API_KEY, OLLAMA_HOST, etc. |

See also