Mancala AI¶

Introduction¶

I play a lot of mancala with my girlfriend over text. Unfortunately (for me), she is a lot better than I am, and I usually lose. I decided the best option for me would be to make an AI to play for me.

Instead of collecting real player game data and training a model on it, I decided to try to train the model without any real player data, which I would come to find out poses a significant challenge.

import numpy as np
import random
from joblib import Parallel, delayed
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from termcolor import colored
import torch.autograd.profiler as profiler
import time

Building the Game¶

The first step is to build the game. What is nice about Mancala is that it is a perfect information game: everything needed to choose a move is contained in the current game state.

I set up classes for the board and the game that enforce the rules and store the full game state. I also created a state() function that returns the features the model will be trained on.

class InvalidMove(Exception):
    pass

# Setup Mancala board
class Board:
    def __init__(self):
        self.stores = np.full((1,2), 0)
        self.board = np.full((2,6), 4)

    def __str__(self):
        board_string = np.array2string(self.board, separator=' ', prefix=' ', suffix='').replace('[', '').replace(']', '')
        board_string = str(self.stores[0][0]) + " " + board_string + " " + str(self.stores[0][1]) 
        return board_string
    
    def copy(self):
        board_copy = Board()
        board_copy.board = np.copy(self.board)
        board_copy.stores = np.copy(self.stores)
        return board_copy
    
    def move(self, player, pit):
        stones = self.board[player][pit] # number of stones to be placed
        self.board[player][pit] = 0 # reset that pit
        direction = (-1)**(player+1) # define the direction the stones will move in.
        row = player # define the side of the board which the stones are currently being placed on
        col = pit # define the current pit

        # Stone placing loop
        while (stones > 0):
            stones -= 1
            col += direction
            # check whether we have reached the end of a row: our own store, or the opponent's store (which is skipped)
            if col == -1 and player == 0: # player 0's own store
                row = 1
                direction *= -1
                self.stores[0][player] += 1
                if stones == 0: # last stone placed in store, player will get another turn
                    return True
            elif col == -1 and player == 1: # passed player 0's store (skipped); wrap onto player 1's row
                row = 1
                direction *= -1
                col = 0
                self.board[row][col] += 1
                if stones == 0 and self.board[1][0] == 1 and self.board[0][0] > 0: #capture case
                    self.stores[0][player] += 1 + self.board[0][0]
                    self.board[0][0] = 0
                    self.board[1][0] = 0
            elif col == 6 and player == 1: # player 1's own store
                row = 0
                direction *= -1
                self.stores[0][player] += 1
                if stones == 0: # last stone placed in store, player will get another turn
                    return True
            elif col == 6 and player == 0: # passed player 1's store (skipped); wrap onto player 0's row
                row = 0
                direction *= -1
                col = 5
                self.board[row][col] += 1
                if stones == 0 and self.board[0][5] == 1 and self.board[1][5] > 0: #capture case
                    self.stores[0][player] += 1 + self.board[1][5]
                    self.board[1][5] = 0
                    self.board[0][5] = 0
            else:
                # capture rule
                opposite_row = 0 if row == 1 else 1
                if row == player and stones == 0 and self.board[row][col] == 0 and self.board[opposite_row][col] > 0:
                    self.stores[0][player] += 1 + self.board[opposite_row][col]
                    self.board[opposite_row][col] = 0
                else: # normal case
                    self.board[row][col] += 1
        return False
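
As a quick sanity check of the sowing and extra-turn rules, here is a minimal example using the Board class above: from the starting position, player 0 sowing pit 3 drops one stone each into pits 2, 1, and 0, and the last stone lands in the store, which earns an extra turn.

# Minimal sanity check for Board.move (illustrative only)
b = Board()
extra_turn = b.move(0, 3)  # player 0 sows the 4 stones from pit 3
print(b)                   # pits 2, 1, 0 now hold 5 stones each; pit 3 is empty; store 0 holds 1
print(extra_turn)          # True: the last stone landed in player 0's store
assert extra_turn and b.stores[0][0] == 1 and b.board[0][3] == 0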


# Mancala Game Object
class Game:
    def __init__(self):
        self.board = Board()
        self.turn = 0
        self.other = 1

    def __str__(self):
        board_string = ""
        rev_board = self.board.board[::-1, ::-1]
        board_string = np.array2string(rev_board, separator=' ', prefix=' ', suffix='').replace('[', '').replace(']', '')
        board_string = str(self.board.stores[0][1]) + " " + board_string + " " + str(self.board.stores[0][0])
        return board_string + "  Player " + str(self.turn)
    
    def state(self, prev_board_state, turn_made, move_made):
        # Build the training example for the move just made: 12 pit counts from before the move
        # (mover's six pits first; both rows are flipped for player 0 so the two players share a
        # consistent orientation), followed by [move_made, move_again, opponent_best_move, score_change].
        other_player = 0 if turn_made == 1 else 1
        score_change = self.board.stores[0][turn_made] - prev_board_state.stores[0][turn_made] # stones banked by this move
        this_player_state = prev_board_state.board[turn_made][::-1] if turn_made == 0 else prev_board_state.board[turn_made]
        other_player_state = prev_board_state.board[other_player][::-1] if turn_made == 0 else prev_board_state.board[other_player]
        move_again = 0 if turn_made != self.turn else 1 # 1 if the mover kept the turn

        # Look at current board state and determine what moves opponent can make
        opponent_best_move = 0
        for index, value in np.ndenumerate(self.board.board[other_player]):
            if value > 0:
                future_board = self.board.copy()
                opponent_move_again = 1 if future_board.move(other_player, index[0]) else 0
                possible_score = future_board.stores[0][other_player] - self.board.stores[0][other_player] + opponent_move_again # stones the opponent would bank, plus a bonus for an extra turn
                opponent_best_move = possible_score if possible_score > opponent_best_move else opponent_best_move
                
        return this_player_state.tolist() + other_player_state.tolist() + [move_made, move_again, opponent_best_move, score_change]
    
    def move(self, move_spot):
        # illegal move
        if self.board.board[self.turn][move_spot] == 0:
            raise InvalidMove("Invalid Move")

        prev_board_state = self.board.copy()
        add_turn = self.board.move(self.turn, move_spot)
        turn_made = self.turn

        # check if game is over
        player_0_empty = True
        player_1_empty = True
        for col in range(0,6):
            if self.board.board[0][col] > 0:
                player_0_empty = False
            if self.board.board[1][col] > 0:
                player_1_empty = False
        # game is over if either player has no stones to move
        if player_0_empty or player_1_empty:
            # collect players' remaining stones to their store
            for col in range(0,6):
                self.board.stores[0][0] += self.board.board[0][col]
                self.board.board[0][col] = 0
                self.board.stores[0][1] += self.board.board[1][col]
                self.board.board[1][col] = 0
            return False, self.state(prev_board_state, turn_made, move_spot)
        
        if not add_turn: # if we are not giving the player another turn, change the player who is going
            self.turn = 0 if self.turn == 1 else 1
            self.other = 0 if self.turn == 1 else 1 # the other player is whoever is not on turn
        return True, self.state(prev_board_state, turn_made, move_spot) # return game state
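
To make the vector returned by state() concrete, here is a minimal example using the Game class above: one opening move is played and the 16 returned values are unpacked. The first twelve entries are the pit counts from before the move, the mover's six pits first (both rows flipped for player 0 so the two players' features share a consistent orientation), followed by the pit that was played, whether the mover earned another turn, the opponent's best immediate reply, and the change in the mover's store.

# Minimal example: play one opening move and inspect the features state() produces
g = Game()
still_playing, s = g.move(2)    # player 0 plays pit 2

my_pits    = s[0:6]             # mover's pits before the move
their_pits = s[6:12]            # opponent's pits before the move
move_made, move_again, opponent_best_move, score_change = s[12:16]

print(my_pits, their_pits)      # [4, 4, 4, 4, 4, 4] [4, 4, 4, 4, 4, 4]
print(move_made, move_again)    # 2 0 -- the last stone did not land in the store
print(score_change)             # 1 -- one stone was dropped in the store on the way around
print(opponent_best_move)       # 2 -- the opponent's pit 2 now reaches their store exactly, earning an extra turn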

Training the Model¶

I used PyTorch to train the model. My first approach was to generate a set of random games and train the model on them. After a while I realized I would need a massive amount of data to train a model that way, so I switched to having the model play its own games and training after each one, a self-play approach that seemed more sound.

As you can see, I did not train for a large number of games. As I mention below, I did this project on my laptop, so I was computationally limited, which definitely hamstrung the model.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# I did this project on my laptop, so we are just going to use the CPU since the GPU is not really worthy of use.
device = 'cpu'

# Define the policy
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.state_space = 12
        self.action_space = 6
        self.hidden1 = 48
        self.hidden2 = 32
        self.hidden3 = 32
        self.l1 = nn.Linear(self.state_space, self.hidden1, bias=False)
        self.l2 = nn.Linear(self.hidden1, self.hidden2, bias=False)
        self.l3 = nn.Linear(self.hidden2, self.hidden3, bias=False)
        self.l4 = nn.Linear(self.hidden3, self.action_space, bias=False)

    def forward(self, x):
        # three ReLU hidden layers followed by a softmax over the six pits
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        x = torch.relu(self.l3(x))
        return torch.softmax(self.l4(x), dim=-1)

def normalize(tensor):
    return (tensor - tensor.mean()) / (tensor.std() + 1e-10)
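
The loss in the loop below is a REINFORCE-style policy gradient, using each move's immediate, normalized reward as the weight rather than a discounted return: the negative log-probability of the action that was taken is scaled by that reward, so moves that scored above the game's average become more likely and moves below it become less likely. Here is the same loss computed on made-up placeholder data (the dummy_* tensors are purely illustrative):

# Standalone illustration of the policy-gradient loss used in the training loop below
dummy_states  = torch.rand(5, 12)                 # 5 fake 12-feature board states
dummy_actions = torch.tensor([0, 3, 2, 5, 1])     # the pit chosen in each state
dummy_rewards = normalize(torch.tensor([1., 0., 4., 2., 0.]))

dummy_policy = Policy()
probs = dummy_policy(dummy_states)                # shape (5, 6): one distribution over pits per state
chosen = probs[range(len(dummy_actions)), dummy_actions]
dummy_loss = (-torch.log(chosen + 1e-10) * dummy_rewards).mean()
print(dummy_loss)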

# Initialize policy and optimizer
policy = Policy()
# Training loop
print(f"Running using {device}")
policy = policy.to(device)
optimizer = optim.AdamW(policy.parameters(), lr=0.001)

n_games = 22000 # 22 is my favorite number
start_time = time.time()
for epoch in range(n_games):

    # Play a game
    game = Game()
    still_playing = True
    # Data for both players
    states = [[],[]]
    actions = [[],[]]
    rewards = [[],[]]
    while still_playing:
        player = game.turn
        curr_state = torch.from_numpy(np.append(game.board.board[game.turn], game.board.board[game.other])).float().to(device)
        probs = policy(curr_state)
        # Sort the moves by their probabilities
        sorted_moves = torch.argsort(probs, descending=True)
        for move in sorted_moves:
            try:
                # Try to make the move
                still_playing, move_state = game.move(move.item())
                states[player].append(move_state[:12]) # the 12 pit counts from before the move
                actions[player].append(move_state[-4]) # the pit that was played
                # reward = stones banked + 1 for an extra turn - 4x the opponent's best immediate reply
                rewards[player].append(move_state[-1] + move_state[-3] - move_state[-2]*4)
                break
            except InvalidMove:
                continue
    
    # Boost rewards for the winner
    if game.board.stores[0][0] > game.board.stores[0][1]:
        rewards[0] = [(rewards[0][i] + 25) for i in range(len(rewards[0]))]
    elif game.board.stores[0][1] > game.board.stores[0][0]:
        rewards[1] = [(rewards[1][i] + 25) for i in range(len(rewards[1]))]

    # calculate loss and step for each player
    for player in [0,1]:
        player_states = torch.tensor(states[player]).float()
        player_actions = torch.tensor(actions[player]).int()
        player_rewards = torch.tensor(rewards[player]).float()
        player_rewards = normalize(player_rewards)

        probs = policy(player_states)
        losses = -torch.log(probs[range(len(player_actions)), player_actions] + 1e-10) * player_rewards
        loss = losses.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (epoch/n_games)*100%1 == 0 and epoch > 0:
        # calculate the percentage and ETA
        percentage = epoch / n_games * 100
        elapsed = time.time() - start_time # get the elapsed time
        eta = elapsed / (epoch / n_games) * (1 - epoch / n_games) # estimate the remaining time
        # convert the elapsed and ETA to hours, minutes, and seconds
        elapsed_h, elapsed_m = divmod(elapsed, 3600)
        elapsed_m, elapsed_s = divmod(elapsed_m, 60)
        eta_h, eta_m = divmod(eta, 3600)
        eta_m, eta_s = divmod(eta_m, 60)
        # print the progress update
        print(f"\r| {percentage:.2f}% Complete | Elapsed: {elapsed_h:.0f}:{elapsed_m:02.0f}:{elapsed_s:02.0f} | ETA: {eta_h:.0f}:{eta_m:02.0f}:{eta_s:02.0f} |", end="")
Running using cpu
| 99.00% Complete | Elapsed: 0:03:33 | ETA: 0:00:02 |

Conclusion and Playing the Game¶

Since this was just a for-fun project, I evaluated the model's performance by playing against it. It's... not the best. As I mentioned previously, I was computationally limited. Self-play training like this tends to improve slowly at the beginning, but I decided it was not worth spending the computational resources to develop the model further.

My main goal with this project was to try something different from what I normally do: instead of working with real data, I challenged myself by creating the data used to train the model. If I revisit this project in the future, I will try to get my hands on real player data to train a model with.
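
If I wanted a rougher, automated sanity check, I could pit the trained policy against an opponent that moves uniformly at random. The sketch below is just that idea (not something I ran for the results here), using the Game class, policy, and device from above; the cell after it is the interactive loop I used to actually play against the model.

# Rough evaluation sketch: trained policy (player 1) vs. a uniformly random mover (player 0)
def play_one_game():
    game = Game()
    still_playing = True
    while still_playing:
        if game.turn == 0:
            # random opponent: pick any non-empty pit
            legal = [p for p in range(6) if game.board.board[0][p] > 0]
            still_playing, _ = game.move(random.choice(legal))
        else:
            # trained policy: try its moves in order of preference, skipping illegal ones
            curr_state = torch.from_numpy(np.append(game.board.board[1], game.board.board[0])).float().to(device)
            for move in torch.argsort(policy(curr_state), descending=True):
                try:
                    still_playing, _ = game.move(move.item())
                    break
                except InvalidMove:
                    continue
    return game.board.stores[0][1] > game.board.stores[0][0]  # True if the policy won outright

wins = sum(play_one_game() for _ in range(100))
print(f"Policy won {wins}/100 games against a random opponent")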

game = Game()
still_playing = True
while still_playing:
    while game.turn == 0 and still_playing:
        print(game)
        while True:
            try:
                player_move = int(input("Move [0-5]:"))
            except ValueError: # non-numeric input, ask again
                continue
            if player_move < 0 or player_move > 5: # only pits 0-5 are valid
                continue
            try:
                still_playing, _ = game.move(player_move)
            except InvalidMove:
                continue
            break

    # Feed the state to the model
    while game.turn == 1 and still_playing:
        curr_state = torch.from_numpy(np.append(game.board.board[1], game.board.board[0])).float().to(device)
        probs = policy(curr_state)
        # print the probabilities in reverse so they match the left-to-right order of the model's pits in the board printout
        for prob in reversed(probs):
            print(colored(str(round(float(prob), 3)), "magenta"), end=" ")
        print("\n" + colored(game, "light_magenta"))
        

        # Sort the moves by their probabilities
        sorted_moves = torch.argsort(probs, descending=True)

        for move in sorted_moves:
            try:
                # Try to make the move
                still_playing, _ = game.move(move.item())
                break
            except InvalidMove:
                continue
print(game)
print("Game Over.")
0 4 4 4 4 4 4
  4 4 4 4 4 4 0  Player 0
0 4 4 4 4 4 4
  4 4 0 5 5 5 1  Player 0
0.0 0.0 0.993 0.007 0.0 0.0 
0 4 4 4 4 4 4
  4 0 1 6 6 6 1  Player 1
1 5 5 0 4 4 4
  5 0 1 6 6 6 1  Player 0
0.732 0.268 0.0 0.0 0.0 0.0 
1 5 5 0 4 4 4
  0 1 2 7 7 7 1  Player 1
2 0 5 0 4 4 4
  1 2 3 8 7 7 1  Player 0
0.0 0.0 0.0 1.0 0.0 0.0 
2 0 6 1 5 5 5
  1 2 3 0 8 8 2  Player 1
3 1 7 2 0 5 5
  2 2 3 0 8 8 2  Player 0
0.0 0.016 0.0 0.0 0.984 0.0 
3 1 7 2 0 5 5
  2 0 4 1 8 8 2  Player 1
0.554 0.446 0.0 0.0 0.0 0.0 
4 2 8 3 1 0 5
  2 0 4 1 8 8 2  Player 1
5 0 8 3 1 0 5
  3 0 4 1 8 8 2  Player 0
5 0 8 3 1 0 5
  3 0 0 2 9 9 3  Player 0
0.989 0.01 0.0 0.001 0.0 0.0 
5  0  8  3  1  0  5
   3  0  0  0 10 10 3  Player 1
6  1  0  3  1  0  5
   4  1  1  1 11 11 3  Player 0
1.0 0.0 0.0 0.0 0.0 0.0 
6  2  1  4  2  1  6
   5  2  2  2 11  0 4  Player 1
7  0  1  4  2  1  6
   6  2  2  2 11  0 4  Player 0
0.0 0.0 1.0 0.0 0.0 0.0 
7 1 2 5 3 2 7
  7 3 3 2 0 1 5  Player 1
8 2 3 0 3 2 7
  8 4 3 2 0 1 5  Player 0
8 2 3 0 3 2 7
  8 4 3 2 0 0 6  Player 0
0.0 0.0 0.0 0.0 1.0 0.0 
8 2 3 0 3 2 0
  8 0 4 3 1 0 14  Player 1
13 2 3 0 4 0 0
  8 0 0 3 1 0 14  Player 0
13 2 3 0 4 0 0
  8 0 0 0 2 1 15  Player 0
13 2 3 0 4 0 0
  8 0 0 0 2 0 16  Player 0
13 2 3 0 4 0 0
  8 0 0 0 0 1 17  Player 0
13 2 3 0 4 0 0
  8 0 0 0 0 0 18  Player 0
0.0 0.523 0.001 0.222 0.254 0.0 
13 2 3 0 4 1 1
  0 1 1 1 1 1 19  Player 1
14 3 0 0 4 1 1
  1 1 1 1 1 1 19  Player 0
14 3 0 0 4 1 1
  1 1 1 1 1 0 20  Player 0
0.0 0.0 0.108 0.892 0.0 0.0 
14 3 0 0 4 1 0
  1 1 1 1 0 0 22  Player 1
0.181 0.802 0.009 0.008 0.0 0.0 
15 4 1 1 0 1 0
  1 1 1 1 0 0 22  Player 1
15 5 0 1 0 1 0
  1 1 1 1 0 0 22  Player 0
0.914 0.074 0.003 0.008 0.0 0.0 
15 5 0 1 0 0 0
  1 1 1 0 0 0 24  Player 1
16 0 0 1 0 0 0
  2 2 2 1 0 0 24  Player 0
0.165 0.023 0.123 0.096 0.003 0.59 
16 0 0 1 0 0 0
  0 3 3 1 0 0 24  Player 1
20 0 0 0 0 0 0
  0 0 0 0 0 0 28  Player 1
Game Over.

As we can see, the model did not beat me. Oh well.