A Glicko-2-based round-robin ranking system designed to test C++ battleship submissions — battleship.dunkirk.sh

fix: implement proper Glicko-2 rating periods to eliminate last-submitter bias

The previous implementation updated Glicko-2 ratings after each individual
match, violating the system's core assumption of rating periods with 10-15+
games. This caused path-dependency where identical algorithms submitted at
different times received different final ratings based on opponent rating
states during their matches.

Changes:
- Refactored RecalculateAllGlicko2Ratings() to use proper rating periods:
  - Batches all match results per player together
  - Uses opponent ratings at the start of the rating period
  - Updates each player's rating once with all results
- Modified RunRoundRobinMatches() to store matches first, then recalculate
  ratings once at the end (proper rating period)
- Removed per-match UpdateGlicko2Ratings() calls
- Added manual recalculation command: ./battleship-arena recalculate-ratings
- Updated leaderboard sorting: rating DESC → wins DESC → avg_moves ASC
- Pending/broken entries use avg_moves=999 to sort to bottom

Live updates still work - SSE broadcasts match progress in real-time.
Glicko ratings update at round-robin completion using proper rating periods.

References:
- Glicko-2 paper: http://www.glicko.net/glicko/glicko2.pdf
- Instant Glicko-2 implementation notes:
https://gist.github.com/gpluscb/302d6b71a8d0fe9f4350d45bc828f802

💘 Generated with Crush

Assisted-by: Claude Sonnet 4.5 via Crush <crush@charm.land>

dunkirk.sh 318520e3 9a95d7f7

verified
+155 -21
+12
AGENTS.md
··· 12 12 - **scp.go** - SCP upload middleware for file submissions 13 13 - **worker.go** - Background processor (runs every 30s) 14 14 15 + ## Glicko-2 Rating System 16 + 17 + **Important**: The system uses Glicko-2 ratings with **proper rating periods** to avoid last-submitter bias: 18 + 19 + - All matches in a round-robin are stored first 20 + - Ratings update **once at the end** using all match results together (proper rating period) 21 + - This eliminates path-dependency where identical algorithms get different ratings based on submission order 22 + - Each player's rating considers ALL their opponents' ratings at the start of the rating period 23 + - Glicko-2 expects 10-15+ games per rating period - our round-robin satisfies this 24 + 25 + **Manual recalculation**: Run `./battleship-arena recalculate-ratings` or `make recalculate-ratings` to recompute all ratings from scratch. 26 + 15 27 ## File Upload 16 28 17 29 Students upload via SCP:
+16 -10
Makefile
··· 48 48 @echo "Building for production..." 49 49 @CGO_ENABLED=1 go build -ldflags="-s -w" -o bin/battleship-arena ./cmd/battleship-arena 50 50 51 + # Recalculate all Glicko-2 ratings from scratch 52 + recalculate-ratings: build 53 + @echo "Recalculating all Glicko-2 ratings..." 54 + @./bin/battleship-arena recalculate-ratings 55 + 51 56 # Show help 52 57 help: 53 58 @echo "Available targets:" 54 - @echo " build - Build the server" 55 - @echo " run - Build and run the server" 56 - @echo " clean - Clean build artifacts" 57 - @echo " test - Run tests" 58 - @echo " gen-key - Generate SSH host key" 59 - @echo " fmt - Format code" 60 - @echo " lint - Lint code" 61 - @echo " deps - Update dependencies" 62 - @echo " build-prod - Build optimized production binary" 63 - @echo " help - Show this help" 59 + @echo " build - Build the server" 60 + @echo " run - Build and run the server" 61 + @echo " clean - Clean build artifacts" 62 + @echo " test - Run tests" 63 + @echo " gen-key - Generate SSH host key" 64 + @echo " fmt - Format code" 65 + @echo " lint - Lint code" 66 + @echo " deps - Update dependencies" 67 + @echo " build-prod - Build optimized production binary" 68 + @echo " recalculate-ratings - Recalculate all Glicko-2 ratings from scratch" 69 + @echo " help - Show this help"
battleship-arena

This is a binary file and will not be displayed.

+13
cmd/battleship-arena/main.go
··· 61 61 if err := initStorage(cfg); err != nil { 62 62 log.Fatal(err) 63 63 } 64 + 65 + // Check for special commands 66 + if len(os.Args) > 1 { 67 + switch os.Args[1] { 68 + case "recalculate-ratings": 69 + log.Println("Recalculating all Glicko-2 ratings from scratch...") 70 + if err := storage.RecalculateAllGlicko2Ratings(); err != nil { 71 + log.Fatalf("Failed to recalculate ratings: %v", err) 72 + } 73 + log.Println("✓ Ratings recalculated successfully") 74 + return 75 + } 76 + } 64 77 65 78 server.InitSSE() 66 79 server.SetConfig(cfg.AdminPasscode, cfg.ExternalURL)
+9 -8
internal/runner/runner.go
··· 329 329 map[int]string{newSub.ID: newSub.Username, opponent.ID: opponent.Username}[winnerID]) 330 330 } 331 331 332 - matchID, err := storage.AddMatch(newSub.ID, opponent.ID, winnerID, player1Wins, player2Wins, avgMoves, avgMoves) 332 + _, err := storage.AddMatch(newSub.ID, opponent.ID, winnerID, player1Wins, player2Wins, avgMoves, avgMoves) 333 333 if err != nil { 334 334 log.Printf("Failed to store match result: %v", err) 335 - } else { 336 - if err := storage.UpdateGlicko2Ratings(newSub.ID, opponent.ID, player1Wins, player2Wins); err != nil { 337 - log.Printf("Glicko-2 update failed: %v", err) 338 - } else { 339 - recordRatingSnapshot(newSub.ID, int(matchID)) 340 - recordRatingSnapshot(opponent.ID, int(matchID)) 341 - } 342 335 } 343 336 } 344 337 345 338 log.Printf("✓ Round-robin complete for %s (%d matches)", newSub.Username, totalMatches) 339 + 340 + // Update Glicko-2 ratings using proper rating periods (batch all matches together) 341 + log.Printf("Updating Glicko-2 ratings (proper rating period)...") 342 + if err := storage.RecalculateAllGlicko2Ratings(); err != nil { 343 + log.Printf("Failed to update Glicko-2 ratings: %v", err) 344 + } else { 345 + log.Printf("✓ Glicko-2 ratings updated") 346 + } 346 347 } 347 348 348 349 func recordRatingSnapshot(submissionID, matchID int) {
+105 -3
internal/storage/database.go
··· 191 191 192 192 func GetLeaderboard(limit int) ([]LeaderboardEntry, error) { 193 193 // Get submissions with matches 194 + // Rankings use Glicko-2 with proper rating periods: 195 + // - All round-robin matches are batched together before rating updates 196 + // - This prevents last-submitter bias from path-dependent rating changes 194 197 query := ` 195 198 SELECT 196 199 s.username, ··· 216 219 350.0 as rd, 217 220 0 as total_wins, 218 221 0 as total_losses, 219 - 0.0 as avg_moves, 222 + 999.0 as avg_moves, 220 223 s.upload_time as last_played, 221 224 1 as is_pending, 222 225 0 as is_broken ··· 234 237 0 as rd, 235 238 0 as total_wins, 236 239 0 as total_losses, 237 - 0.0 as avg_moves, 240 + 999.0 as avg_moves, 238 241 s.upload_time as last_played, 239 242 0 as is_pending, 240 243 1 as is_broken 241 244 FROM submissions s 242 245 WHERE s.is_active = 1 AND s.status = 'compilation_failed' 243 246 244 - ORDER BY is_broken ASC, is_pending ASC, rating DESC, total_wins DESC 247 + ORDER BY is_broken ASC, is_pending ASC, rating DESC, total_wins DESC, avg_moves ASC 245 248 LIMIT ? 
246 249 ` 247 250 ··· 732 735 ) 733 736 return err 734 737 } 738 + 739 + // RecalculateAllGlicko2Ratings recalculates all Glicko-2 ratings from scratch 740 + // using proper rating periods where all matches for a player are batched together 741 + func RecalculateAllGlicko2Ratings() error { 742 + // Reset all active submissions to initial ratings 743 + _, err := DB.Exec(` 744 + UPDATE submissions 745 + SET glicko_rating = 1500.0, glicko_rd = 350.0, glicko_volatility = 0.06 746 + WHERE is_active = 1 AND status = 'completed' 747 + `) 748 + if err != nil { 749 + return err 750 + } 751 + 752 + // Get all active player IDs 753 + var playerIDs []int 754 + rows, err := DB.Query("SELECT id FROM submissions WHERE is_active = 1 AND status = 'completed'") 755 + if err != nil { 756 + return err 757 + } 758 + for rows.Next() { 759 + var id int 760 + if err := rows.Scan(&id); err != nil { 761 + return err 762 + } 763 + playerIDs = append(playerIDs, id) 764 + } 765 + rows.Close() 766 + 767 + // For each player, collect ALL their match results and update once (proper rating period) 768 + for _, playerID := range playerIDs { 769 + // Get player's current rating 770 + var rating, rd, volatility float64 771 + err := DB.QueryRow( 772 + "SELECT glicko_rating, glicko_rd, glicko_volatility FROM submissions WHERE id = ?", 773 + playerID, 774 + ).Scan(&rating, &rd, &volatility) 775 + if err != nil { 776 + continue 777 + } 778 + 779 + // Collect ALL match results for this player in this rating period 780 + var results []Glicko2Result 781 + 782 + rows, err := DB.Query(` 783 + SELECT 784 + CASE WHEN player1_id = ? THEN player2_id ELSE player1_id END as opponent_id, 785 + CASE WHEN player1_id = ? THEN player1_wins ELSE player2_wins END as my_wins, 786 + CASE WHEN player1_id = ? THEN player2_wins ELSE player1_wins END as opponent_wins 787 + FROM matches 788 + WHERE (player1_id = ? OR player2_id = ?) 
AND is_valid = 1 789 + ORDER BY timestamp ASC 790 + `, playerID, playerID, playerID, playerID, playerID) 791 + 792 + if err != nil { 793 + continue 794 + } 795 + 796 + for rows.Next() { 797 + var opponentID, myWins, opponentWins int 798 + if err := rows.Scan(&opponentID, &myWins, &opponentWins); err != nil { 799 + continue 800 + } 801 + 802 + // Get opponent's rating at the START of this rating period (not current) 803 + var oppRating, oppRD float64 804 + err := DB.QueryRow( 805 + "SELECT glicko_rating, glicko_rd FROM submissions WHERE id = ?", 806 + opponentID, 807 + ).Scan(&oppRating, &oppRD) 808 + if err != nil { 809 + continue 810 + } 811 + 812 + totalGames := myWins + opponentWins 813 + score := float64(myWins) / float64(totalGames) 814 + 815 + results = append(results, Glicko2Result{ 816 + OpponentRating: oppRating, 817 + OpponentRD: oppRD, 818 + Score: score, 819 + }) 820 + } 821 + rows.Close() 822 + 823 + // Update this player's rating based on ALL results at once (proper rating period) 824 + if len(results) > 0 { 825 + player := Glicko2Player{Rating: rating, RD: rd, Volatility: volatility} 826 + newPlayer := updateGlicko2(player, results) 827 + 828 + DB.Exec( 829 + "UPDATE submissions SET glicko_rating = ?, glicko_rd = ?, glicko_volatility = ? WHERE id = ?", 830 + newPlayer.Rating, newPlayer.RD, newPlayer.Volatility, playerID, 831 + ) 832 + } 833 + } 834 + 835 + return nil 836 + }