
Outcome Rewards

Outcome rewards are episode-level summaries attached to complete sessions. They provide a single quality score for the entire episode, enabling efficient filtering, ranking, and analysis.

Overview

Outcome rewards capture session-level outcomes:
  • Total reward (e.g., unique achievements unlocked)
  • Achievement count
  • Step count (episode length)
  • Custom metadata (achievements list, final state, etc.)
Unlike event rewards (which are per-step), outcome rewards summarize the entire episode. They are used for:
  • Filtering: Select high-quality episodes for SFT training
  • Ranking: Sort episodes by performance
  • Analysis: Track policy improvement over time

Schema

class OutcomeReward:
    """Episode-level reward summary for one completed session.

    One row per session; joined to ``session_traces`` via ``session_id``.
    Used for filtering, ranking, and analysis of whole episodes.
    """

    id: int                      # Primary key
    session_id: str             # FK to session_traces.session_id
    total_reward: int           # Total episode reward (e.g., unique achievements)
    achievements_count: int     # Number of achievements unlocked
    total_steps: int            # Episode length
    reward_metadata: dict | None  # JSON metadata (achievements list, etc.)
    created_at: datetime        # Timestamp

Fields

session_id (required)
  • Foreign key to session_traces.session_id
  • Links this outcome to a specific session/episode
total_reward (required)
  • Scalar summary of episode performance
  • Common interpretations:
    • Unique achievements unlocked this episode
    • Cumulative environment reward
    • Final score from evaluator
    • Any episode-level metric
achievements_count (required)
  • Number of achievements/milestones reached
  • Useful for environments with discrete goals
total_steps (required)
  • Number of steps/turns in the episode
  • Used for:
    • Filtering by episode length
    • Computing efficiency metrics (reward per step)
reward_metadata (optional)
  • JSON dictionary with additional context:
    {
        "achievements_list": ["collect_wood", "craft_pickaxe", "defeat_zombie"],
        "final_state": {"health": 9, "inventory": {...}},
        "terminated": true,
        "termination_reason": "success",
        "env_cumulative_reward": 12.5,
    }
    

Recording Outcome Rewards

From Task Apps

Record outcome rewards at the end of rollout execution:
from synth_ai.tracing_v3 import SessionTracer

# At end of episode: bind the tracer to the session being summarized.
tracer = SessionTracer(session_id="episode_001")

# Record outcome
# NOTE(review): assumes SessionTracer exposes record_outcome_reward() with
# these keyword arguments -- confirm against synth_ai.tracing_v3.
tracer.record_outcome_reward(
    total_reward=7,  # 7 unique achievements unlocked
    achievements_count=7,
    total_steps=42,
    reward_metadata={
        "achievements_list": [
            "collect_wood",
            "craft_pickaxe",
            "craft_sword",
            "defeat_zombie",
            "defeat_skeleton",
            "place_stone",
            "place_table",
        ],
        "final_health": 8,
        "terminated": True,
    },
)

Manual Recording

import json
import sqlite3

# Insert an outcome reward row directly.  Fixes vs. the previous example:
# - `json` was used without being imported;
# - the `[...]` / `{...}` placeholders were Python Ellipsis literals that
#   json.dumps cannot serialize -- replaced with concrete example values;
# - the connection is now closed even if the INSERT fails.
conn = sqlite3.connect("traces.db")
try:
    cursor = conn.cursor()
    cursor.execute(
        """
        INSERT INTO outcome_rewards (
            session_id,
            total_reward,
            achievements_count,
            total_steps,
            reward_metadata
        ) VALUES (?, ?, ?, ?, ?)
        """,
        (
            "episode_001",
            10,
            10,
            50,
            json.dumps(
                {
                    "achievements": ["collect_wood", "craft_pickaxe"],
                    "final_state": {"health": 9},
                }
            ),
        ),
    )
    conn.commit()
finally:
    conn.close()

Querying Outcome Rewards

Get Outcome for a Session

import json
import sqlite3

conn = sqlite3.connect("traces.db")
cursor = conn.cursor()

# Fetch the outcome summary for a single session.
cursor.execute(
    """
    SELECT 
        total_reward,
        achievements_count,
        total_steps,
        reward_metadata
    FROM outcome_rewards
    WHERE session_id = ?
    """,
    ("episode_001",)
)

row = cursor.fetchone()
if row:
    total_reward, achievements_count, total_steps, metadata = row
    print(f"Reward: {total_reward}, Achievements: {achievements_count}, Steps: {total_steps}")
    # reward_metadata is optional (nullable column): guard before decoding,
    # since json.loads(None) raises TypeError.
    print(f"Metadata: {json.loads(metadata) if metadata else None}")

Filter High-Quality Episodes

# Get episodes with at least 5 achievements and 10 steps
# NOTE(review): assumes `cursor` is an open sqlite3 cursor from an earlier
# snippet on this page.
cursor.execute(
    """
    SELECT 
        session_id,
        total_reward,
        achievements_count,
        total_steps
    FROM outcome_rewards
    WHERE 
        achievements_count >= 5
        AND total_steps >= 10
    ORDER BY total_reward DESC
    LIMIT 100
    """
)

# Each row: (session_id, total_reward, achievements_count, total_steps),
# sorted best-first, capped at 100 rows.
high_quality_sessions = cursor.fetchall()

Compute Statistics

# Aggregate statistics across all recorded episodes.
cursor.execute(
    """
    SELECT 
        COUNT(*) as num_episodes,
        AVG(total_reward) as avg_reward,
        MAX(total_reward) as max_reward,
        AVG(total_steps) as avg_steps,
        AVG(CAST(total_reward AS FLOAT) / total_steps) as avg_reward_per_step
    FROM outcome_rewards
    """
)

# NOTE(review): on an empty table every aggregate except COUNT(*) comes back
# as None, which makes the float format specs below raise -- confirm the
# table is non-empty before formatting.
stats = cursor.fetchone()
print(f"Episodes: {stats[0]}")
print(f"Avg Reward: {stats[1]:.2f}")
print(f"Max Reward: {stats[2]}")
print(f"Avg Steps: {stats[3]:.1f}")
print(f"Avg Reward/Step: {stats[4]:.3f}")

Use Cases

1. Filter for SFT Training

# Use CLI to filter high-quality episodes
uvx synth-ai filter \
  --min-reward 5.0 \
  --min-steps 10 \
  --max-steps 100 \
  --output high_quality.jsonl
Python equivalent:
def export_sft_data(
    db_path: str,
    min_reward: float,
    min_steps: int,
    max_steps: int,
    output_path: str,
) -> None:
    """Export high-quality sessions for SFT as JSONL, one trace per line.

    Selects sessions whose outcome reward and episode length fall within the
    given bounds, ordered best-first.

    Args:
        db_path: Path to the SQLite trace database.
        min_reward: Minimum total_reward (inclusive).
        min_steps: Minimum total_steps (inclusive).
        max_steps: Maximum total_steps (inclusive).
        output_path: Destination JSONL file (overwritten).
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get qualifying sessions
        cursor.execute(
            """
            SELECT session_id
            FROM outcome_rewards
            WHERE 
                total_reward >= ?
                AND total_steps >= ?
                AND total_steps <= ?
            ORDER BY total_reward DESC
            """,
            (min_reward, min_steps, max_steps)
        )
        session_ids = [row[0] for row in cursor.fetchall()]

        # Export traces to JSONL.
        # NOTE(review): get_session_trace is a project helper assumed to
        # return a JSON-serializable dict for the session.
        with open(output_path, "w") as f:
            for session_id in session_ids:
                trace = get_session_trace(conn, session_id)
                f.write(json.dumps(trace) + "\n")
    finally:
        # Fix: the original leaked the connection.
        conn.close()

2. Track Policy Improvement

def track_policy_performance(db_path: str, policy_name: str) -> dict:
    """Track average outcome reward over policy iterations.

    Args:
        db_path: Path to the SQLite trace database.
        policy_name: Value of the ``policy_name`` key in session metadata.

    Returns:
        Mapping of policy iteration (int) to
        ``{"avg_reward": float, "num_episodes": int}``.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        # Fixes vs. the original:
        # - "or" is a reserved SQL keyword and cannot be a table alias;
        # - json_extract() replaces the ->> operator, which requires
        #   SQLite >= 3.38.
        cursor.execute(
            """
            SELECT 
                json_extract(st.metadata, '$.policy_iteration') as policy_iter,
                AVG(o.total_reward) as avg_reward,
                COUNT(*) as num_episodes
            FROM outcome_rewards o
            JOIN session_traces st ON o.session_id = st.session_id
            WHERE json_extract(st.metadata, '$.policy_name') = ?
            GROUP BY policy_iter
            ORDER BY CAST(policy_iter AS INTEGER)
            """,
            (policy_name,)
        )

        performance = {}
        for policy_iter, avg_reward, num_episodes in cursor.fetchall():
            # Skip sessions whose metadata lacks a policy_iteration entry
            # (int(None) would raise).
            if policy_iter is None:
                continue
            performance[int(policy_iter)] = {
                "avg_reward": avg_reward,
                "num_episodes": num_episodes,
            }

        return performance
    finally:
        conn.close()

3. Combine with Event Rewards

def compute_weighted_score(
    session_id: str,
    conn: sqlite3.Connection,
    weights: dict,
) -> float:
    """
    Compute weighted combination of event and outcome rewards.
    
    Args:
        session_id: Session to score
        conn: Database connection
        weights: {"event": 0.3, "outcome": 0.7}

    Returns:
        ``weights["event"] * sum(event rewards) +
        weights["outcome"] * outcome total``; missing rewards count as 0.
    """
    cursor = conn.cursor()

    # Get outcome reward (missing row or NULL value counts as zero).
    cursor.execute(
        "SELECT total_reward FROM outcome_rewards WHERE session_id = ?",
        (session_id,)
    )
    outcome = cursor.fetchone()
    outcome_reward = outcome[0] if outcome and outcome[0] is not None else 0.0

    # Sum event rewards.
    cursor.execute(
        "SELECT SUM(reward_value) FROM event_rewards WHERE session_id = ?",
        (session_id,)
    )
    event_sum = cursor.fetchone()
    # Fix: SUM() over zero rows yields one row containing NULL, so the fetched
    # tuple is truthy even with no events -- the original multiplied None by a
    # float and raised TypeError.  Check the value, not just the row.
    event_reward = event_sum[0] if event_sum and event_sum[0] is not None else 0.0

    # Weighted combination
    score = (
        weights["event"] * event_reward +
        weights["outcome"] * outcome_reward
    )

    return score

4. Export Top-K Episodes

def export_top_k_episodes(
    db_path: str,
    k: int,
    output_dir: str,
) -> None:
    """Export the k highest-reward episodes as pretty-printed JSON files.

    Each file is named ``{session_id}_r{reward}_a{achievements}_s{steps}.json``
    and written under *output_dir* (created if missing).

    Args:
        db_path: Path to the SQLite trace database.
        k: Number of top episodes to export.
        output_dir: Destination directory for JSON files.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get top-k sessions
        cursor.execute(
            """
            SELECT 
                session_id,
                total_reward,
                achievements_count,
                total_steps
            FROM outcome_rewards
            ORDER BY total_reward DESC
            LIMIT ?
            """,
            (k,)
        )
        top_rows = cursor.fetchall()

        # Ensure the destination exists before the first write.
        os.makedirs(output_dir, exist_ok=True)

        for session_id, reward, ach_count, steps in top_rows:
            # Get full trace (project helper; assumed JSON-serializable).
            trace = get_session_trace(conn, session_id)

            # Export to file
            filename = f"{session_id}_r{reward}_a{ach_count}_s{steps}.json"
            filepath = os.path.join(output_dir, filename)
            with open(filepath, "w") as f:
                json.dump(trace, f, indent=2, default=str)

            # Fix: the original printed a literal placeholder instead of the
            # exported path.
            print(f"Exported: {filepath}")
    finally:
        conn.close()

Integration with Judges

Judges can populate outcome rewards automatically:
from synth_ai.judge_schemas import JudgeScoreResponse

async def score_and_record_outcome(
    session_id: str,
    trace: dict,
    tracer: SessionTracer,
):
    """Score episode with judge and record outcome.

    NOTE(review): relies on a module-level ``judge_client`` plus the
    JudgeScoreRequest/JudgeTaskApp/JudgeTracePayload/JudgeOptions schemas,
    none of which are imported in this snippet -- confirm against
    ``synth_ai.judge_schemas``.  Also assumes ``trace`` has "metadata" and
    "event_history" keys.
    """
    # Score with judge
    response: JudgeScoreResponse = await judge_client.score(
        JudgeScoreRequest(
            policy_name="my-policy",
            task_app=JudgeTaskApp(id="crafter"),
            trace=JudgeTracePayload(**trace),
            options=JudgeOptions(outcome=True),
        )
    )
    
    # Extract outcome score; treat a missing/None aggregate as zero.
    outcome_total = response.aggregate_outcome_reward() or 0.0
    
    # Record as outcome reward; step count taken as the event-history length.
    tracer.record_outcome_reward(
        total_reward=int(outcome_total),
        achievements_count=len(trace["metadata"].get("achievements", [])),
        total_steps=len(trace["event_history"]),
        reward_metadata={
            "outcome_review": response.outcome_review.dict() if response.outcome_review else None,
            "judge_provider": response.details.get("provider"),
        },
    )

Best Practices

1. Choose Meaningful total_reward

# Good: Use unique achievements (episode-specific)
total_reward = len(unique_achievements_this_episode)

# Good: Use normalized cumulative reward
total_reward = sum(env_rewards) / max_possible_reward

# Bad: Use raw cumulative reward (scale depends on episode length)
total_reward = sum(env_rewards)  # Unfair comparison across lengths

2. Populate reward_metadata

# Good: Rich metadata
tracer.record_outcome_reward(
    total_reward=7,
    achievements_count=7,
    total_steps=42,
    reward_metadata={
        "achievements_list": [...],
        "final_state": {...},
        "terminated": True,
        "termination_reason": "success",
        "env_cumulative_reward": 12.5,
        "unique_achievements": 7,
        "total_achievements": 10,  # Some were repeated
    },
)

# Bad: No metadata
tracer.record_outcome_reward(
    total_reward=7,
    achievements_count=7,
    total_steps=42,
)

3. Normalize for Episode Length

# When filtering, consider reward per step
def get_normalized_score(total_reward: float, total_steps: int) -> float:
    """Return the reward earned per step.

    Episodes reporting fewer than one step are treated as one step long, so
    the division is always well-defined.
    """
    steps = total_steps if total_steps > 1 else 1
    return total_reward / steps

# Rank sessions by reward efficiency (reward per step).
cursor.execute(
    # NOTE(review): referencing the "reward_per_step" alias in WHERE is a
    # SQLite extension; standard SQL requires repeating the expression there.
    """
    SELECT 
        session_id,
        total_reward,
        total_steps,
        CAST(total_reward AS FLOAT) / total_steps as reward_per_step
    FROM outcome_rewards
    WHERE reward_per_step >= 0.1  -- High efficiency
    ORDER BY reward_per_step DESC
    """
)

4. Use Appropriate Thresholds

# Environment-specific thresholds
CRAFTER_THRESHOLDS = {
    "min_reward": 3,  # At least 3 achievements
    "min_steps": 10,  # At least 10 steps
    "max_steps": 200,  # Not too long
}

MATH_THRESHOLDS = {
    "min_reward": 1,  # Solved at least 1 problem
    "min_steps": 5,   # At least 5 reasoning steps
    "max_steps": 50,
}

def filter_by_env(env_name: str, conn: sqlite3.Connection):
    thresholds = {
        "crafter": CRAFTER_THRESHOLDS,
        "math": MATH_THRESHOLDS,
    }.get(env_name, {})
    
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT session_id
        FROM outcome_rewards or
        JOIN session_traces st ON or.session_id = st.session_id
        WHERE 
            st.metadata->>'environment_name' = ?
            AND or.total_reward >= ?
            AND or.total_steps >= ?
            AND or.total_steps <= ?
        """,
        (
            env_name,
            thresholds.get("min_reward", 0),
            thresholds.get("min_steps", 0),
            thresholds.get("max_steps", 1000),
        )
    )
    
    return [row[0] for row in cursor.fetchall()]

Relationship to Event Rewards

| Aspect        | Outcome Rewards   | Event Rewards         |
| ------------- | ----------------- | --------------------- |
| Scope         | Entire episode    | Individual steps      |
| Purpose       | Filtering, ranking | Credit assignment    |
| When recorded | End of episode    | During episode        |
| FK target     | session_id        | event_id              |
| Use in RL     | Episode selection | Advantage computation |
| Use in SFT    | Data filtering    | Step-level filtering  |
Common pattern: Use outcome rewards to filter high-quality episodes, then use event rewards to select the best steps within those episodes.
# Step 1: Get high-quality episodes
cursor.execute(
    """
    SELECT session_id
    FROM outcome_rewards
    WHERE total_reward >= 5 AND total_steps >= 10
    """
)
good_episodes = [row[0] for row in cursor.fetchall()]

# Step 2: Get high-reward steps from those episodes.
# Fix: with zero qualifying episodes the original built "IN ()", which is a
# SQL syntax error -- guard the empty case.
if good_episodes:
    placeholders = ",".join("?" * len(good_episodes))
    cursor.execute(
        f"""
        SELECT event_id
        FROM event_rewards
        WHERE 
            session_id IN ({placeholders})
            AND reward_type = 'unique_achievement_delta'
            AND reward_value > 0
        """,
        good_episodes,
    )
    high_quality_steps = [row[0] for row in cursor.fetchall()]
else:
    high_quality_steps = []

See Also