Module 6 Project: PSRO for Satellite Constellation Coverage
What you are building
You will implement the PSRO outer loop for a two-player satellite constellation coverage game. Each player controls a set of satellites that can observe one orbital slot per turn. Players compete for coverage of a shared region. The project builds the complete PSRO pipeline: simulating policy rollouts to fill a payoff matrix, solving the meta-game Nash with linear programming, training RL best-response oracles, and iterating until the population converges. By the end you will have a working PSRO loop and a policy population that plays a non-trivial coverage strategy.
The game
Players: Two operators, each controlling 3 satellites.
State: Each satellite has a current orbital slot assignment (slots 0–7, arranged in a ring). The full state is 6 slot assignments (3 per player).
Actions: Each player simultaneously assigns each of their 3 satellites to a slot. Assignments are made without observing the opponent's assignments first (simultaneous-move game).
Payoff: Computed after both players commit:
- Each uniquely covered slot scores +1 for the covering player
- If both players cover the same slot, neither scores (contested, +0 each)
- Each player's score is the count of uniquely covered slots minus 0.5 × contested slots
This is a zero-sum game: total payoff sums to the number of non-contested covered slots.
Step 1: the environment
import numpy as np
from itertools import combinations
N_SLOTS = 8
N_SATS = 3
def compute_payoff(assign_p1: list[int], assign_p2: list[int]) -> tuple[float, float]:
set1 = set(assign_p1)
set2 = set(assign_p2)
contested = set1 & set2
unique1 = set1 - contested
unique2 = set2 - contested
p1 = len(unique1) - 0.5 * len(contested)
p2 = len(unique2) - 0.5 * len(contested)
return p1, p2
# All possible assignments: choose N_SATS distinct slots from N_SLOTS
ALL_ACTIONS = list(combinations(range(N_SLOTS), N_SATS))
N_ACTIONS = len(ALL_ACTIONS) # C(8,3) = 56
ACTION_INDEX = {a: i for i, a in enumerate(ALL_ACTIONS)}
print(f"Action space size: {N_ACTIONS}")
Step 2: build the full payoff matrix
def build_payoff_matrix() -> np.ndarray:
M = np.zeros((N_ACTIONS, N_ACTIONS))
for i, a1 in enumerate(ALL_ACTIONS):
for j, a2 in enumerate(ALL_ACTIONS):
p1, _ = compute_payoff(list(a1), list(a2))
M[i, j] = p1
return M
M_full = build_payoff_matrix()
print(f"Payoff matrix shape: {M_full.shape}")
Step 3: Nash solver for the meta-game
from scipy.optimize import linprog
def solve_nash_zero_sum(M: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
"""Solve a zero-sum normal-form game via LP. Returns (sigma_1, sigma_2)."""
n, m = M.shape
# Player 1: maximize min expected payoff
c = np.zeros(n + 1)
c[-1] = -1.0
A_ub = np.hstack([-M.T, np.ones((m, 1))])
b_ub = np.zeros(m)
A_eq = np.ones((1, n + 1)); A_eq[0, -1] = 0
b_eq = np.array([1.0])
bounds = [(0, None)] * n + [(None, None)]
r1 = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
sigma_1 = np.maximum(r1.x[:n], 0); sigma_1 /= sigma_1.sum()
# Player 2: minimize max expected payoff
c2 = np.zeros(m + 1); c2[-1] = 1.0
A_ub2 = np.hstack([M, -np.ones((n, 1))])
b_ub2 = np.zeros(n)
A_eq2 = np.ones((1, m + 1)); A_eq2[0, -1] = 0
b_eq2 = np.array([1.0])
bounds2 = [(0, None)] * m + [(None, None)]
r2 = linprog(c2, A_ub=A_ub2, b_ub=b_ub2, A_eq=A_eq2, b_eq=b_eq2, bounds=bounds2)
sigma_2 = np.maximum(r2.x[:m], 0); sigma_2 /= sigma_2.sum()
return sigma_1, sigma_2
Step 4: best-response oracle
def best_response_p1(sigma_2: np.ndarray, population_indices: list[int]) -> int:
expected_payoffs = np.zeros(N_ACTIONS)
for k, j in enumerate(population_indices):
expected_payoffs += sigma_2[k] * M_full[:, j]
return int(np.argmax(expected_payoffs))
def best_response_p2(sigma_1: np.ndarray, population_indices: list[int]) -> int:
expected_payoffs = np.zeros(N_ACTIONS)
for k, i in enumerate(population_indices):
expected_payoffs += sigma_1[k] * M_full[i, :]
return int(np.argmin(expected_payoffs))
Step 5: the PSRO outer loop
def run_psro(n_iterations: int = 20, seed: int = 42) -> dict:
rng = np.random.default_rng(seed)
pop1 = [int(rng.integers(N_ACTIONS))]
pop2 = [int(rng.integers(N_ACTIONS))]
history = []
for iteration in range(n_iterations):
M_restricted = M_full[np.ix_(pop1, pop2)]
sigma_1, sigma_2 = solve_nash_zero_sum(M_restricted)
meta_nash_value = float(sigma_1 @ M_restricted @ sigma_2)
br1 = best_response_p1(sigma_2, pop2)
br2 = best_response_p2(sigma_1, pop1)
br1_value = float(M_full[br1, :][pop2] @ sigma_2)
br2_value = float(M_full[:, br2][pop1] @ sigma_1)
exploitability = (br1_value - meta_nash_value) + (meta_nash_value - (-br2_value))
history.append({
"iteration": iteration,
"pop_size": len(pop1),
"meta_nash_value": meta_nash_value,
"exploitability": exploitability,
})
print(f"Iter {iteration:2d} | pop={len(pop1):2d} | "
f"value={meta_nash_value:+.3f} | exploit={exploitability:.4f}")
if br1 not in pop1:
pop1.append(br1)
if br2 not in pop2:
pop2.append(br2)
if exploitability < 1e-4:
print(f"Converged at iteration {iteration}.")
break
return {"history": history, "pop1": pop1, "pop2": pop2}
results = run_psro()
Step 6: analyze results
import matplotlib.pyplot as plt
history = results["history"]
iters = [h["iteration"] for h in history]
exploits = [h["exploitability"] for h in history]
values = [h["meta_nash_value"] for h in history]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.semilogy(iters, exploits, "b-o", markersize=4)
ax1.set_xlabel("PSRO iteration"); ax1.set_ylabel("Exploitability (log scale)")
ax1.set_title("Convergence to Nash"); ax1.grid(True, alpha=0.3)
ax2.plot(iters, values, "r-o", markersize=4)
ax2.set_xlabel("PSRO iteration"); ax2.set_ylabel("Meta-Nash value (player 1)")
ax2.set_title("Nash equilibrium value over iterations")
ax2.axhline(0, color="k", linestyle="--", alpha=0.5, label="zero-sum balance")
ax2.legend(); ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("psro_convergence.png", dpi=150)
plt.show()
# Print final Nash strategies
final = history[-1]
print("\nFinal Nash mixture (player 1):")
for idx, w in zip(results["pop1"], results["history"][-1]["meta_nash_value"] * np.ones(len(results["pop1"]))):
slots = ALL_ACTIONS[idx]
print(f" slots={slots}")
What to observe
-
Exploitability drops to near zero within 5–10 PSRO iterations for this game. The restricted Nash converges to the full-game Nash because the best-response oracle quickly identifies the dominant pure strategies.
-
The Nash value is near zero — in this symmetric game, neither player can guarantee more than equal expected coverage at equilibrium.
-
Population size at convergence is typically 4–8 pure strategies. Inspect which slot assignments appear at high weight: they will tend to spread satellites evenly to minimize conflict with the opponent.
-
Deviation from Nash is costly: compute the expected payoff when player 1 plays a fixed assignment (e.g., slots 0, 1, 2) against the Nash mixture of player 2. Compare to the Nash value. The gap measures how exploitable a predictable player is.
-
Extension: replace the analytical best-response with the RL oracle from lesson 3 to see how the loop behaves when the oracle is approximate.