I play a lot of Mancala with my girlfriend over text. Unfortunately (for me), she is a lot better than I am, and I usually lose. I decided the best option for me would be to make an AI to play for me.
Rather than collecting real player game data and training a model on it, I decided to try to train the model without any real player data, which, I would come to find out, poses a significant challenge.
import numpy as np
import random
from joblib import Parallel, delayed
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from termcolor import colored
import torch.autograd.profiler as profiler
import time
The first step is to build the game. What is nice about Mancala is that it is a perfect-information game, so all of the information needed about the game is contained in the game state.
I set up classes for the board and the game that enforce the rules and store all of the information about the game state. I also created a state() function to return the information that the model will be trained on.
class InvalidMove(Exception):
pass
# Setup Mancala board
class Board:
def __init__(self):
self.stores = np.full((1,2), 0)
self.board = np.full((2,6), 4)
def __str__(self):
board_string = np.array2string(self.board, separator=' ', prefix=' ', suffix='').replace('[', '').replace(']', '')
board_string = str(self.stores[0][0]) + " " + board_string + " " + str(self.stores[0][1])
return board_string
def copy(self):
board_copy = Board()
board_copy.board = np.copy(self.board)
board_copy.stores = np.copy(self.stores)
return board_copy
def move(self, player, pit):
        stones = self.board[player][pit] # number of stones to be placed
self.board[player][pit] = 0 # reset that pit
direction = (-1)**(player+1) # define the direction the stones will move in.
row = player # define the side of the board which the stones are currently being placed on
col = pit # define the current pit
# Stone placing loop
while (stones > 0):
stones -= 1
col += direction
# check if we are in the end and need to reverse direction
if col == -1 and player == 0:
row = 1
direction *= -1
self.stores[0][player] += 1
if stones == 0: # last stone placed in store, player will get another turn
return True
elif col == -1 and player == 1:
row = 1
direction *= -1
col = 0
self.board[row][col] += 1
if stones == 0 and self.board[1][0] == 1 and self.board[0][0] > 0: #capture case
self.stores[0][player] += 1 + self.board[0][0]
self.board[0][0] = 0
self.board[1][0] = 0
elif col == 6 and player == 1:
row = 0
direction *= -1
self.stores[0][player] += 1
if stones == 0: # last stone placed in store, player will get another turn
return True
elif col == 6 and player == 0:
row = 0
direction *= -1
col = 5
self.board[row][col] += 1
if stones == 0 and self.board[0][5] == 1 and self.board[1][5] > 0: #capture case
self.stores[0][player] += 1 + self.board[1][5]
self.board[1][5] = 0
self.board[0][5] = 0
else:
# capture rule
opposite_row = 0 if row == 1 else 1
if row == player and stones == 0 and self.board[row][col] == 0 and self.board[opposite_row][col] > 0:
self.stores[0][player] += 1 + self.board[opposite_row][col]
self.board[opposite_row][col] = 0
else: # normal case
self.board[row][col] += 1
return False
# Mancala Game Object
class Game:
def __init__(self):
self.board = Board()
self.turn = 0
self.other = 1
def __str__(self):
board_string = ""
rev_board = self.board.board[::-1, ::-1]
board_string = np.array2string(rev_board, separator=' ', prefix=' ', suffix='').replace('[', '').replace(']', '')
board_string = str(self.board.stores[0][1]) + " " + board_string + " " + str(self.board.stores[0][0])
return board_string + " Player " + str(self.turn)
def state(self, prev_board_state, turn_made, move_made):
other_player = 0 if turn_made == 1 else 1
score_change = self.board.stores[0][turn_made] - prev_board_state.stores[0][turn_made]
this_player_state = prev_board_state.board[turn_made][::-1] if turn_made == 0 else prev_board_state.board[turn_made]
other_player_state = prev_board_state.board[other_player][::-1] if turn_made == 0 else prev_board_state.board[other_player]
move_again = 0 if turn_made != self.turn else 1
# Look at current board state and determine what moves opponent can make
opponent_best_move = 0
for index, value in np.ndenumerate(self.board.board[other_player]):
if value > 0:
future_board = self.board.copy()
opponent_move_again = 1 if future_board.move(other_player, index[0]) else 0
possible_score = future_board.stores[0][other_player] - self.board.stores[0][other_player] + opponent_move_again
opponent_best_move = possible_score if possible_score > opponent_best_move else opponent_best_move
return this_player_state.tolist() + other_player_state.tolist() + [move_made, move_again, opponent_best_move, score_change]
def move(self, move_spot):
# illegal move
if self.board.board[self.turn][move_spot] == 0:
raise InvalidMove("Invalid Move")
prev_board_state = self.board.copy()
add_turn = self.board.move(self.turn, move_spot)
turn_made = self.turn
# check if game is over
player_0_empty = True
player_1_empty = True
for col in range(0,6):
if self.board.board[0][col] > 0:
player_0_empty = False
if self.board.board[1][col] > 0:
player_1_empty = False
# game is over if either player has no stones to move
if player_0_empty or player_1_empty:
# collect players' remaining stones to their store
for col in range(0,6):
self.board.stores[0][0] += self.board.board[0][col]
self.board.board[0][col] = 0
self.board.stores[0][1] += self.board.board[1][col]
self.board.board[1][col] = 0
return False, self.state(prev_board_state, turn_made, move_spot)
if not add_turn: # if we are not giving the player another turn, change the player who is going
self.turn = 0 if self.turn == 1 else 1
self.other = 1 if self.turn == 1 else 0
return True, self.state(prev_board_state, turn_made, move_spot) # return game state
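Before training anything, it helps to see what the Game class actually hands back. The snippet below is just an illustrative sanity check of the API above, not part of the training code; the exact values printed depend on the move chosen.
demo = Game()
still_going, state_vec = demo.move(2)   # player 0 sows the stones from pit 2
print(still_going)                      # False only once the game has ended
print(state_vec)                        # 12 pit counts + [move_made, move_again, opponent_best_move, score_change]
print(demo)                             # the board plus whose turn it is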
I used PyTorch to train the model. My first approach was to generate a set of randomly played games and try to train the model on those. I figured out after a while that I was going to need a massive amount of data to train a model this way, so I switched to playing each game with the model and training after every game. I thought this evolutionary approach was a sounder methodology.
As you can see, I did not train for a large number of games. As I mention below, I did this project on my laptop, so I was computationally limited, which definitely hamstrung my model.
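For reference, the abandoned random-game approach looked roughly like the sketch below. This is a reconstruction for illustration (the helper name random_game is mine), not the original data-generation code.
# Rough reconstruction of the first approach: play out a game with uniformly random legal moves
# and collect (state, action, reward) tuples to train on later.
def random_game():
    game = Game()
    history = []
    playing = True
    while playing:
        player = game.turn
        legal_pits = [p for p in range(6) if game.board.board[player][p] > 0]
        pit = random.choice(legal_pits)          # any legal pit, chosen uniformly at random
        playing, s = game.move(pit)
        history.append((s[:12], s[-4], s[-1]))   # board state, move made, score change
    return history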
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# I did this project on my laptop, so we are just going to use the CPU since the GPU is not really worth using.
device = 'cpu'
# Define the policy
class Policy(nn.Module):
def __init__(self):
super(Policy, self).__init__()
self.state_space = 12
self.action_space = 6
self.hidden1 = 48
self.hidden2 = 32
self.hidden3 = 32
self.l1 = nn.Linear(self.state_space, self.hidden1, bias=False)
self.l2 = nn.Linear(self.hidden1, self.hidden2, bias=False)
self.l3 = nn.Linear(self.hidden2, self.hidden3, bias=False)
self.l4 = nn.Linear(self.hidden3, self.action_space, bias=False)
def forward(self, x):
model = torch.nn.Sequential(
self.l1,
nn.ReLU(),
self.l2,
nn.ReLU(),
self.l3,
nn.ReLU(),
self.l4,
nn.Softmax(dim=-1)
)
return model(x)
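# Quick shape check (illustrative, not from the original run): an untrained Policy fed a fresh board
# (all twelve pits holding 4 stones) should return 6 move probabilities that sum to ~1.
_probs = Policy()(torch.full((12,), 4.0))
print(_probs, _probs.sum())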
def normalize(tensor):
return (tensor - tensor.mean()) / (tensor.std() + 1e-10)
# Initialize policy and optimizer
policy = Policy()
# Training loop
print(f"Running using {device}")
policy = policy.to(device)
optimizer = optim.AdamW(policy.parameters(), lr=0.001)
n_games = 22000 # 22 is my favorite number
start_time = time.time()
for epoch in range(n_games):
# Play a game
game = Game()
still_playing = True
# Data for both players
states = [[],[]]
actions = [[],[]]
rewards = [[],[]]
while still_playing:
player = game.turn
curr_state = torch.from_numpy(np.append(game.board.board[game.turn], game.board.board[game.other])).float().to(device)
probs = policy(curr_state)
# Sort the moves by their probabilities
sorted_moves = torch.argsort(probs, descending=True)
for move in sorted_moves:
try:
# Try to make the move
still_playing, move_state = game.move(move.item())
                states[player].append(move_state[:12]) # the 12 pit counts: this player's side, then the opponent's
                actions[player].append(move_state[-4]) # move_made
                rewards[player].append(move_state[-1] + move_state[-3] - move_state[-2]*4) # score_change + move_again - 4*opponent_best_move
break
except InvalidMove:
continue
# Boost rewards for the winner
if game.board.stores[0][0] > game.board.stores[0][1]:
rewards[0] = [(rewards[0][i] + 25) for i in range(len(rewards[0]))]
elif game.board.stores[0][1] > game.board.stores[0][0]:
rewards[1] = [(rewards[1][i] + 25) for i in range(len(rewards[1]))]
# calculate loss and step for each player
for player in [0,1]:
player_states = torch.tensor(states[player]).float()
player_actions = torch.tensor(actions[player]).int()
player_rewards = torch.tensor(rewards[player]).float()
player_rewards = normalize(player_rewards)
probs = policy(player_states)
losses = -torch.log(probs[range(len(player_actions)), player_actions] + 1e-10) * player_rewards
loss = losses.mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (epoch/n_games)*100%1 == 0 and epoch > 0:
# calculate the percentage and ETA
percentage = epoch / n_games * 100
elapsed = time.time() - start_time # get the elapsed time
eta = elapsed / (epoch / n_games) * (1 - epoch / n_games) # estimate the remaining time
# convert the elapsed and ETA to hours, minutes, and seconds
elapsed_h, elapsed_m = divmod(elapsed, 3600)
elapsed_m, elapsed_s = divmod(elapsed_m, 60)
eta_h, eta_m = divmod(eta, 3600)
eta_m, eta_s = divmod(eta_m, 60)
# print the progress update
print(f"\r| {percentage:.2f}% Complete | Elapsed: {elapsed_h:.0f}:{elapsed_m:02.0f}:{elapsed_s:02.0f} | ETA: {eta_h:.0f}:{eta_m:02.0f}:{eta_s:02.0f} |", end="")
Running using cpu | 99.00% Complete | Elapsed: 0:03:33 | ETA: 0:00:02 |
Since this was just a for-fun project, I evaluated the performance of my model by playing it. It's... not the best. As I mentioned previously, I was computationally limited. Generally, learning for evolutionary algorithms like this tends to be slower at the beginning, but I decided it was not worth paying for the computational resources to develop this model further.
My main goal with this project was to try something different than I normally do: instead of working with real data, I challenged myself to create the data used to train the model. If I revisit this project in the future, I will try to get my hands on real player data to train a model with.
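A more quantitative check, which I did not run, would be to pit the policy against a uniformly random opponent over a batch of games and look at the win rate. A rough sketch of that idea (the play_vs_random helper is hypothetical):
def play_vs_random(policy, n_games=100):
    wins = 0
    for _ in range(n_games):
        game = Game()
        playing = True
        while playing:
            if game.turn == 1:   # the trained policy plays as player 1, as in the loop below
                x = torch.from_numpy(np.append(game.board.board[1], game.board.board[0])).float()
                candidate_moves = [m.item() for m in torch.argsort(policy(x), descending=True)]
            else:                # the opponent (player 0) just picks among its legal pits at random
                candidate_moves = [p for p in range(6) if game.board.board[0][p] > 0]
                random.shuffle(candidate_moves)
            for move in candidate_moves:
                try:
                    playing, _ = game.move(move)
                    break
                except InvalidMove:
                    continue
        if game.board.stores[0][1] > game.board.stores[0][0]:
            wins += 1
    return wins / n_games
What I actually did is below: I play as player 0 and the model plays as player 1.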
game = Game()
still_playing = True
while still_playing:
while game.turn == 0 and still_playing:
print(game)
while True:
player_move = int(input("Move [0-5]:"))
if player_move == 6:
continue
try:
still_playing, _ = game.move(player_move)
except InvalidMove:
continue
break
# Feed the state to the model
while game.turn == 1 and still_playing:
curr_state = torch.from_numpy(np.append(game.board.board[1], game.board.board[0])).float().to(device)
probs = policy(curr_state)
[print(colored(str(round(float(prob), 3)), "magenta"), end=" ") for prob in reversed(probs)]
print("\n" + colored(game, "light_magenta"))
# Sort the moves by their probabilities
sorted_moves = torch.argsort(probs, descending=True)
for move in sorted_moves:
try:
# Try to make the move
still_playing, _ = game.move(move.item())
break
except InvalidMove:
continue
print(game)
print("Game Over.")
0 4 4 4 4 4 4 4 4 4 4 4 4 0 Player 0
0 4 4 4 4 4 4 4 4 0 5 5 5 1 Player 0
0.0 0.0 0.993 0.007 0.0 0.0
0 4 4 4 4 4 4 4 0 1 6 6 6 1 Player 1
1 5 5 0 4 4 4 5 0 1 6 6 6 1 Player 0
0.732 0.268 0.0 0.0 0.0 0.0
1 5 5 0 4 4 4 0 1 2 7 7 7 1 Player 1
2 0 5 0 4 4 4 1 2 3 8 7 7 1 Player 0
0.0 0.0 0.0 1.0 0.0 0.0
2 0 6 1 5 5 5 1 2 3 0 8 8 2 Player 1
3 1 7 2 0 5 5 2 2 3 0 8 8 2 Player 0
0.0 0.016 0.0 0.0 0.984 0.0
3 1 7 2 0 5 5 2 0 4 1 8 8 2 Player 1
0.554 0.446 0.0 0.0 0.0 0.0
4 2 8 3 1 0 5 2 0 4 1 8 8 2 Player 1
5 0 8 3 1 0 5 3 0 4 1 8 8 2 Player 0
5 0 8 3 1 0 5 3 0 0 2 9 9 3 Player 0
0.989 0.01 0.0 0.001 0.0 0.0
5 0 8 3 1 0 5 3 0 0 0 10 10 3 Player 1
6 1 0 3 1 0 5 4 1 1 1 11 11 3 Player 0
1.0 0.0 0.0 0.0 0.0 0.0
6 2 1 4 2 1 6 5 2 2 2 11 0 4 Player 1
7 0 1 4 2 1 6 6 2 2 2 11 0 4 Player 0
0.0 0.0 1.0 0.0 0.0 0.0
7 1 2 5 3 2 7 7 3 3 2 0 1 5 Player 1
8 2 3 0 3 2 7 8 4 3 2 0 1 5 Player 0
8 2 3 0 3 2 7 8 4 3 2 0 0 6 Player 0
0.0 0.0 0.0 0.0 1.0 0.0
8 2 3 0 3 2 0 8 0 4 3 1 0 14 Player 1
13 2 3 0 4 0 0 8 0 0 3 1 0 14 Player 0
13 2 3 0 4 0 0 8 0 0 0 2 1 15 Player 0
13 2 3 0 4 0 0 8 0 0 0 2 0 16 Player 0
13 2 3 0 4 0 0 8 0 0 0 0 1 17 Player 0
13 2 3 0 4 0 0 8 0 0 0 0 0 18 Player 0
0.0 0.523 0.001 0.222 0.254 0.0
13 2 3 0 4 1 1 0 1 1 1 1 1 19 Player 1
14 3 0 0 4 1 1 1 1 1 1 1 1 19 Player 0
14 3 0 0 4 1 1 1 1 1 1 1 0 20 Player 0
0.0 0.0 0.108 0.892 0.0 0.0
14 3 0 0 4 1 0 1 1 1 1 0 0 22 Player 1
0.181 0.802 0.009 0.008 0.0 0.0
15 4 1 1 0 1 0 1 1 1 1 0 0 22 Player 1
15 5 0 1 0 1 0 1 1 1 1 0 0 22 Player 0
0.914 0.074 0.003 0.008 0.0 0.0
15 5 0 1 0 0 0 1 1 1 0 0 0 24 Player 1
16 0 0 1 0 0 0 2 2 2 1 0 0 24 Player 0
0.165 0.023 0.123 0.096 0.003 0.59
16 0 0 1 0 0 0 0 3 3 1 0 0 24 Player 1
20 0 0 0 0 0 0 0 0 0 0 0 0 28 Player 1
Game Over.
As you can see, the model did not beat me. Oh well.