GSPO: Group Sequence Policy Optimization

GSPO is a policy gradient method for fine-tuning language models via reinforcement learning. It uses pipelined rollouts for efficient on-policy training.
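At its core, GSPO is a GRPO-style objective that clips a length-normalized, sequence-level importance ratio rather than per-token ratios. The sketch below is plain PyTorch, not the trainer's actual code; it shows the loss for one group of G sampled completions, and the clipping constant and tensor shapes are illustrative assumptions.

import torch

def gspo_loss(logp_new, logp_old, rewards, seq_lens, clip_eps=0.2):
    """Clipped sequence-level surrogate for one group of G completions.

    logp_new / logp_old: summed token log-probs of each completion under the
    current policy and the rollout (behaviour) policy, shape (G,).
    rewards: scalar reward per completion, shape (G,).
    seq_lens: completion lengths in tokens, shape (G,).
    """
    # Group-relative advantage: normalize rewards within the group.
    adv = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    # Length-normalized sequence-level importance ratio.
    ratio = torch.exp((logp_new - logp_old) / seq_lens)
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    # PPO-style clipped objective; return the negative mean to minimize.
    return -torch.min(ratio * adv, clipped * adv).mean()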

When to Use

  • Training model weights (not just prompts)
  • Multi-turn agent tasks
  • When you have a reward signal from environment interaction
  • Scaling to larger models with GPU training

Config Reference

[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
task_url = "https://your-tunnel.trycloudflare.com"
judge_url = "https://synth-backend.onrender.com/api"  # Optional

[compute]
gpu_type = "H100"
gpu_count = 2
nodes = 1

[topology]
type = "single_node_split"
gpus_for_vllm = 1
gpus_for_training = 1
gpus_for_ref = 0
tensor_parallel = 1

[vllm]
tensor_parallel_size = 1
max_model_len = 8192

[reference]
placement = "none"  # or "dedicated"
port = 8002
tp = 1
health_max_wait_s = 180
health_interval_ms = 300

[model]
base = "Qwen/Qwen3-4B"  # OR source = "ft:checkpoint_id"
trainer_mode = "lora"  # or "full"
label = "my-rl-model"

[rollout]
env_name = "my-task"
policy_name = "my-policy"
max_turns = 10
episodes_per_batch = 32
max_concurrent_rollouts = 8
batches_per_step = 2
ops = ["agent", "env"]

[evaluation]
instances = 50
every_n_iters = 10
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[training]
num_epochs = 1
iterations_per_epoch = 20
max_turns = 10
batch_size = 16
group_size = 4
gradient_accumulation_steps = 1
learning_rate = 5e-5
log_interval = 1
weight_sync_interval = 1

# Reward configuration
step_rewards_enabled = true
step_rewards_mode = "decision_stepwise"  # or "off", "env_sparse"
step_rewards_beta = 0.0
step_rewards_indicator_lambda = 1.0
event_rewards_kind = "unique"  # or "absolute"

[training.weight_sync]
enable = true
targets = ["policy"]

[training.lora]
r = 16
alpha = 32
dropout = 0.1
target_modules = ["q_proj", "v_proj"]

[judge]
enabled = false
type = "synth"
timeout_s = 30

[judge.reward_blend]
env = 1.0
event = 0.0
outcome = 0.0

[judge.options]
event = false
outcome = false
provider = "synth"
model = "synth-judge-v1"

Algorithm Config [algorithm]

| Parameter | Type | Default | Description |
|---|---|---|---|
| type | string | - | Must be "online" for RL |
| method | string | - | "policy_gradient" or "ppo" |
| variety | string | - | "gspo" for Group Sequence Policy Optimization |

Services Config [services]

| Parameter | Type | Default | Description |
|---|---|---|---|
| task_url | string | - | URL of your task app (tunnel URL) |
| judge_url | string | null | Optional judge service URL |

Compute Config [compute]

| Parameter | Type | Default | Description |
|---|---|---|---|
| gpu_type | string | - | GPU SKU: "H100", "H200", "A100" |
| gpu_count | int | - | Number of GPUs |
| nodes | int | 1 | Number of nodes |

Topology Config [topology]

| Parameter | Type | Default | Description |
|---|---|---|---|
| type | string | - | "single_node_split" |
| gpus_for_vllm | int | - | GPUs for inference server |
| gpus_for_training | int | - | GPUs for training |
| gpus_for_ref | int | - | GPUs for reference model |
| tensor_parallel | int | - | Tensor parallelism degree |
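One consistency check worth doing: with a "single_node_split" topology, the GPU split should account for every GPU requested in [compute]. A minimal sketch using the literal values from the example config above (assuming the three gpus_for_* fields are meant to partition compute.gpu_count):

# Values from the example config above.
gpus_for_vllm, gpus_for_training, gpus_for_ref = 1, 1, 0
gpu_count = 2

used = gpus_for_vllm + gpus_for_training + gpus_for_ref
assert used == gpu_count, f"GPU split uses {used} of {gpu_count} GPUs"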

vLLM Config [vllm]

| Parameter | Type | Default | Description |
|---|---|---|---|
| tensor_parallel_size | int | - | TP size for inference |
| max_model_len | int | 8192 | Max sequence length |

Reference Config [reference]

| Parameter | Type | Default | Description |
|---|---|---|---|
| placement | string | - | "none" or "dedicated" |
| port | int | 8002 | Reference server port |
| tp | int | 1 | Tensor parallelism |
| health_max_wait_s | int | 180 | Health check timeout (seconds) |
| health_interval_ms | int | 300 | Health check interval (milliseconds) |
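The two health_* fields describe a poll-until-ready loop: retry every health_interval_ms until health_max_wait_s elapses. The sketch below assumes a hypothetical GET /health endpoint on the reference server and only illustrates how the two values interact; the trainer performs this check internally.

import time
import urllib.request

def wait_for_reference(port=8002, health_max_wait_s=180, health_interval_ms=300):
    # Hypothetical health endpoint; the real path is an implementation detail.
    url = f"http://localhost:{port}/health"
    deadline = time.time() + health_max_wait_s
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:
            pass
        time.sleep(health_interval_ms / 1000)
    return False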

Model Config [model]

| Parameter | Type | Default | Description |
|---|---|---|---|
| base | string | - | Base model (e.g., "Qwen/Qwen3-4B") |
| source | string | - | Checkpoint ID (e.g., "ft:abc123") |
| trainer_mode | string | - | "lora", "full", or "qlora" |
| label | string | - | Model identifier/name |

Note: Set exactly one of base or source.
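The base/source constraint is easy to validate up front. A minimal, self-contained check (the file name rl.toml is the same assumption as in the earlier tomllib snippet):

import tomllib

with open("rl.toml", "rb") as f:
    model = tomllib.load(f)["model"]

has_base, has_source = "base" in model, "source" in model
assert has_base != has_source, "Set exactly one of model.base or model.source"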

Rollout Config [rollout]

| Parameter | Type | Default | Description |
|---|---|---|---|
| env_name | string | - | Environment/task name |
| policy_name | string | - | Policy identifier |
| max_turns | int | - | Max steps per episode |
| episodes_per_batch | int | - | Episodes per training batch |
| max_concurrent_rollouts | int | - | Max concurrent rollouts |
| batches_per_step | int | null | Batches per training step |
| ops | list[str] | null | Operations: ["agent", "env"] |
| env_config | dict | null | Environment-specific config |
| policy_config | dict | null | Policy-specific config |
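How these knobs combine is easiest to see with the numbers from the example config. The arithmetic below assumes that episodes_per_batch * batches_per_step episodes feed each training step and that max_concurrent_rollouts only bounds parallelism; treat it as an approximation rather than a guarantee about the trainer's scheduling.

# Values from the example config above.
episodes_per_batch = 32
batches_per_step = 2
max_concurrent_rollouts = 8

episodes_per_step = episodes_per_batch * batches_per_step        # 64 episodes per training step
rollout_waves = -(-episodes_per_step // max_concurrent_rollouts)  # 8 waves of concurrent rollouts
print(episodes_per_step, rollout_waves)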

Evaluation Config [evaluation]

| Parameter | Type | Default | Description |
|---|---|---|---|
| instances | int | - | Number of evaluation instances |
| every_n_iters | int | - | Evaluate every N iterations |
| seeds | list[int] | - | Evaluation seeds |

Training Config [training]

| Parameter | Type | Default | Description |
|---|---|---|---|
| num_epochs | int | - | Number of training epochs |
| iterations_per_epoch | int | - | Iterations per epoch |
| max_turns | int | - | Max turns during training |
| batch_size | int | - | Training batch size |
| group_size | int | - | GSPO group size |
| learning_rate | float | - | Optimizer learning rate |
| gradient_accumulation_steps | int | null | Gradient accumulation steps |
| max_accumulated_minibatch | int | null | Max accumulated minibatch |
| log_interval | int | null | Log every N steps |
| weight_sync_interval | int | null | Sync weights every N steps |
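group_size is the number of completions sampled per prompt for GSPO's group-relative advantage, so a batch of batch_size sequences covers batch_size / group_size prompts. A small worked example with the values from the config above; the exact batching semantics belong to the trainer, so this is only an approximation.

# Values from the example config above.
batch_size = 16
group_size = 4
gradient_accumulation_steps = 1

prompts_per_minibatch = batch_size // group_size                     # 4 prompts, 4 completions each
sequences_per_optim_step = batch_size * gradient_accumulation_steps  # 16 sequences per optimizer step
print(prompts_per_minibatch, sequences_per_optim_step)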

Reward Config (in [training])

| Parameter | Type | Default | Description |
|---|---|---|---|
| step_rewards_enabled | bool | null | Enable step-level rewards |
| step_rewards_mode | string | null | "off", "decision_stepwise", or "env_sparse" |
| step_rewards_beta | float | null | Step reward coefficient |
| step_rewards_indicator_lambda | float | null | Indicator lambda |
| step_rewards_strategy | string | null | Reward strategy |
| event_rewards_kind | string | null | "unique" or "absolute" |

Weight Sync Config [training.weight_sync]

| Parameter | Type | Default | Description |
|---|---|---|---|
| enable | bool | null | Enable weight sync |
| targets | list[str] | null | Sync targets: ["policy"] |
| mode | string | null | Sync mode |
| direct | bool | null | Direct sync |
| verify_every_k | int | null | Verify every K syncs |

LoRA Config [training.lora]

| Parameter | Type | Default | Description |
|---|---|---|---|
| r | int | 16 | LoRA rank |
| alpha | int | 32 | LoRA alpha |
| dropout | float | 0.1 | LoRA dropout |
| target_modules | list[str] | - | Modules to apply LoRA to |
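For reference, these four fields mirror Hugging Face peft's LoraConfig. Assuming the trainer maps them one-to-one (not confirmed here), the example config corresponds roughly to:

from peft import LoraConfig

# Roughly equivalent peft config; task_type is an added assumption.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)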

Judge Config [judge]

| Parameter | Type | Default | Description |
|---|---|---|---|
| enabled | bool | false | Enable judge scoring |
| type | string | null | Judge type: "synth" |
| timeout_s | int | null | Judge timeout (seconds) |

Judge Reward Blend [judge.reward_blend]

| Parameter | Type | Default | Description |
|---|---|---|---|
| env | float | 1.0 | Weight for environment reward |
| event | float | 0.0 | Weight for event reward |
| outcome | float | 0.0 | Weight for outcome reward |
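Assuming these weights form a simple linear blend (an inference from the field names, not confirmed here), the scalar training reward would look like:

def blend_reward(env_reward, event_reward, outcome_reward,
                 w_env=1.0, w_event=0.0, w_outcome=0.0):
    # With the defaults above, only the environment reward contributes.
    return w_env * env_reward + w_event * event_reward + w_outcome * outcome_reward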

Judge Options [judge.options]

| Parameter | Type | Default | Description |
|---|---|---|---|
| event | bool | null | Enable event judging |
| outcome | bool | null | Enable outcome judging |
| provider | string | null | Judge provider |
| model | string | null | Judge model |
| rubric_id | string | null | Rubric identifier |
| rubric_overrides | dict | null | Rubric overrides |
| tracks | list[str] | null | Tracks to judge |
| weights | dict | null | Track weights |
| max_concurrency | int | null | Max concurrent judge calls |

Returns

from synth_ai.sdk.api.train.rl import RLJob

job = RLJob.from_config("rl.toml")
job.submit()
result = job.poll_until_complete()

# Get results
print(f"Status: {result['status']}")
print(f"Final Reward: {result.get('final_reward', 'N/A')}")

# Get model ID
model_id = result.get("model_id")
# e.g., "ft:Qwen/Qwen3-0.6B:job_658ba4f3a93845aa"

Results Structure

{
    "status": "succeeded",
    "final_reward": 0.85,
    "model_id": "ft:Qwen/Qwen3-0.6B:job_abc123",
    "checkpoints": [
        {"step": 100, "path": "..."},
        {"step": 200, "path": "..."},
    ],
}
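Once the job has finished, the result is plain data. Continuing from the RLJob snippet above, you can, for example, grab the trained model ID and the latest checkpoint:

model_id = result["model_id"]
latest = max(result.get("checkpoints", []), key=lambda c: c["step"], default=None)
if latest is not None:
    print(f"Latest checkpoint at step {latest['step']}: {latest['path']}")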

Using Your Model

Dev Inference (testing):
import os

from synth_ai.sdk import InferenceClient

client = InferenceClient(
    base_url="https://agent-learning.onrender.com",
    api_key=os.environ["SYNTH_API_KEY"],
)

# Call this from inside an async function (e.g., run it via asyncio.run).
response = await client.create_chat_completion(
    model="ft:Qwen/Qwen3-0.6B:job_abc123",
    messages=[{"role": "user", "content": "Hello!"}],
)

Export to HuggingFace:
uvx synth-ai artifacts export ft:Qwen/Qwen3-0.6B:job_abc123 \
  --repo-id myorg/my-rl-model \
  --private
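After the export completes, the repo can be pulled like any Hugging Face model. The snippet below assumes the export produces a standard Transformers checkpoint with merged weights; if it ships a LoRA adapter instead, load it with peft on top of the base model.

from transformers import AutoModelForCausalLM, AutoTokenizer

# "myorg/my-rl-model" is the --repo-id used in the export command above.
tokenizer = AutoTokenizer.from_pretrained("myorg/my-rl-model")
model = AutoModelForCausalLM.from_pretrained("myorg/my-rl-model")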

List Your Models

uvx synth-ai status models --type rl