I play a lot of Mancala with my girlfriend over text. Unfortunately (for me), she is a lot better than I am, and I usually lose. I decided the best option for me would be to make an AI to play for me.
Rather than collecting real player game data and training a model on it, I decided to try to train the model without any real player data, which, I would come to find out, poses a significant challenge.
import numpy as np
import random
from joblib import Parallel, delayed
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from termcolor import colored
import torch.autograd.profiler as profiler
import time
The first step is to build the game. What is nice about Mancala is that it is a perfect-information game, so all of the information needed about the game is contained in the game state.
I set up classes for the board and the game that enforce the rules and store all of the information about the game state. I also created a state() function to return the information that the model will be trained on.
class InvalidMove(Exception):
pass
# Setup Mancala board
class Board:
def __init__(self):
self.stores = np.full((1,2), 0)
self.board = np.full((2,6), 4)
def __str__(self):
board_string = np.array2string(self.board, separator=' ', prefix=' ', suffix='').replace('[', '').replace(']', '')
board_string = str(self.stores[0][0]) + " " + board_string + " " + str(self.stores[0][1])
return board_string
def copy(self):
board_copy = Board()
board_copy.board = np.copy(self.board)
board_copy.stores = np.copy(self.stores)
return board_copy
def move(self, player, pit):
        stones = self.board[player][pit] # number of stones to be placed
self.board[player][pit] = 0 # reset that pit
direction = (-1)**(player+1) # define the direction the stones will move in.
row = player # define the side of the board which the stones are currently being placed on
col = pit # define the current pit
# Stone placing loop
while (stones > 0):
stones -= 1
col += direction
# check if we are in the end and need to reverse direction
if col == -1 and player == 0:
row = 1
direction *= -1
self.stores[0][player] += 1
if stones == 0: # last stone placed in store, player will get another turn
return True
elif col == -1 and player == 1:
row = 1
direction *= -1
col = 0
self.board[row][col] += 1
if stones == 0 and self.board[1][0] == 1 and self.board[0][0] > 0: #capture case
self.stores[0][player] += 1 + self.board[0][0]
self.board[0][0] = 0
self.board[1][0] = 0
elif col == 6 and player == 1:
row = 0
direction *= -1
self.stores[0][player] += 1
if stones == 0: # last stone placed in store, player will get another turn
return True
elif col == 6 and player == 0:
row = 0
direction *= -1
col = 5
self.board[row][col] += 1
if stones == 0 and self.board[0][5] == 1 and self.board[1][5] > 0: #capture case
self.stores[0][player] += 1 + self.board[1][5]
self.board[1][5] = 0
self.board[0][5] = 0
else:
# capture rule
opposite_row = 0 if row == 1 else 1
if row == player and stones == 0 and self.board[row][col] == 0 and self.board[opposite_row][col] > 0:
self.stores[0][player] += 1 + self.board[opposite_row][col]
self.board[opposite_row][col] = 0
else: # normal case
self.board[row][col] += 1
return False
# Mancala Game Object
class Game:
def __init__(self):
self.board = Board()
self.turn = 0
self.other = 1
def __str__(self):
board_string = ""
rev_board = self.board.board[::-1, ::-1]
board_string = np.array2string(rev_board, separator=' ', prefix=' ', suffix='').replace('[', '').replace(']', '')
board_string = str(self.board.stores[0][1]) + " " + board_string + " " + str(self.board.stores[0][0])
return board_string + " Player " + str(self.turn)
def state(self, prev_board_state, turn_made, move_made):
other_player = 0 if turn_made == 1 else 1
score_change = self.board.stores[0][turn_made] - prev_board_state.stores[0][turn_made]
this_player_state = prev_board_state.board[turn_made][::-1] if turn_made == 0 else prev_board_state.board[turn_made]
other_player_state = prev_board_state.board[other_player][::-1] if turn_made == 0 else prev_board_state.board[other_player]
move_again = 0 if turn_made != self.turn else 1
# Look at current board state and determine what moves opponent can make
opponent_best_move = 0
for index, value in np.ndenumerate(self.board.board[other_player]):
if value > 0:
future_board = self.board.copy()
opponent_move_again = 1 if future_board.move(other_player, index[0]) else 0
possible_score = future_board.stores[0][other_player] - self.board.stores[0][other_player] + opponent_move_again
opponent_best_move = possible_score if possible_score > opponent_best_move else opponent_best_move
return this_player_state.tolist() + other_player_state.tolist() + [move_made, move_again, opponent_best_move, score_change]
def move(self, move_spot):
# illegal move
if self.board.board[self.turn][move_spot] == 0:
raise InvalidMove("Invalid Move")
prev_board_state = self.board.copy()
add_turn = self.board.move(self.turn, move_spot)
turn_made = self.turn
# check if game is over
player_0_empty = True
player_1_empty = True
for col in range(0,6):
if self.board.board[0][col] > 0:
player_0_empty = False
if self.board.board[1][col] > 0:
player_1_empty = False
# game is over if either player has no stones to move
if player_0_empty or player_1_empty:
# collect players' remaining stones to their store
for col in range(0,6):
self.board.stores[0][0] += self.board.board[0][col]
self.board.board[0][col] = 0
self.board.stores[0][1] += self.board.board[1][col]
self.board.board[1][col] = 0
return False, self.state(prev_board_state, turn_made, move_spot)
if not add_turn: # if we are not giving the player another turn, change the player who is going
self.turn = 0 if self.turn == 1 else 1
self.other = 1 if self.turn == 1 else 0
return True, self.state(prev_board_state, turn_made, move_spot) # return game state
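Before training anything, it helps to see what the Game class actually hands back. The snippet below is just an illustrative sanity check of the API above, not part of the training code; the exact values printed depend on the move chosen.
demo = Game()
still_going, state_vec = demo.move(2)   # player 0 sows the stones from pit 2
print(still_going)                      # False only once the game has ended
print(state_vec)                        # 12 pit counts + [move_made, move_again, opponent_best_move, score_change]
print(demo)                             # the board plus whose turn it is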
I used PyTorch to train the model. My first approach was to generate a set of randomly played games and try to train the model on those. I figured out after a while that I was going to need a massive amount of data to train a model this way, so I switched to playing each game with the model and training after every game. I thought this evolutionary approach was a sounder methodology.
As you can see, I did not train for a large number of games. As I mention below, I did this project on my laptop, so I was computationally limited, which definitely hamstrung my model.
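For reference, the abandoned random-game approach looked roughly like the sketch below. This is a reconstruction for illustration (the helper name random_game is mine), not the original data-generation code.
# Rough reconstruction of the first approach: play out a game with uniformly random legal moves
# and collect (state, action, reward) tuples to train on later.
def random_game():
    game = Game()
    history = []
    playing = True
    while playing:
        player = game.turn
        legal_pits = [p for p in range(6) if game.board.board[player][p] > 0]
        pit = random.choice(legal_pits)          # any legal pit, chosen uniformly at random
        playing, s = game.move(pit)
        history.append((s[:12], s[-4], s[-1]))   # board state, move made, score change
    return history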
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# I did this project on my laptop, so we are just going to use the CPU since the GPU is not really worth using.
device = 'cpu'
# Define the policy
class Policy(nn.Module):
def __init__(self):
super(Policy, self).__init__()
self.state_space = 12
self.action_space = 6
self.hidden1 = 48
self.hidden2 = 32
self.hidden3 = 32
self.l1 = nn.Linear(self.state_space, self.hidden1, bias=False)
self.l2 = nn.Linear(self.hidden1, self.hidden2, bias=False)
self.l3 = nn.Linear(self.hidden2, self.hidden3, bias=False)
self.l4 = nn.Linear(self.hidden3, self.action_space, bias=False)
def forward(self, x):
model = torch.nn.Sequential(
self.l1,
nn.ReLU(),
self.l2,
nn.ReLU(),
self.l3,
nn.ReLU(),
self.l4,
nn.Softmax(dim=-1)
)
return model(x)
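# Quick shape check (illustrative, not from the original run): an untrained Policy fed a fresh board
# (all twelve pits holding 4 stones) should return 6 move probabilities that sum to ~1.
_probs = Policy()(torch.full((12,), 4.0))
print(_probs, _probs.sum())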
def normalize(tensor):
return (tensor - tensor.mean()) / (tensor.std() + 1e-10)
# Initialize policy and optimizer
policy = Policy()
# Training loop
print(f"Running using {device}")
policy = policy.to(device)
optimizer = optim.AdamW(policy.parameters(), lr=0.001)
n_games = 22000 # 22 is my favorite number
start_time = time.time()
for epoch in range(n_games):
# Play a game
game = Game()
still_playing = True
# Data for both players
states = [[],[]]
actions = [[],[]]
rewards = [[],[]]
while still_playing:
player = game.turn
curr_state = torch.from_numpy(np.append(game.board.board[game.turn], game.board.board[game.other])).float().to(device)
probs = policy(curr_state)
# Sort the moves by their probabilities
sorted_moves = torch.argsort(probs, descending=True)
for move in sorted_moves:
try:
# Try to make the move
still_playing, move_state = game.move(move.item())
                states[player].append(move_state[:12]) # the 12 pit counts: this player's side, then the opponent's
                actions[player].append(move_state[-4]) # move_made
                rewards[player].append(move_state[-1] + move_state[-3] - move_state[-2]*4) # score_change + move_again - 4*opponent_best_move
break
except InvalidMove:
continue
# Boost rewards for the winner
if game.board.stores[0][0] > game.board.stores[0][1]:
rewards[0] = [(rewards[0][i] + 25) for i in range(len(rewards[0]))]
elif game.board.stores[0][1] > game.board.stores[0][0]:
rewards[1] = [(rewards[1][i] + 25) for i in range(len(rewards[1]))]
# calculate loss and step for each player
for player in [0,1]:
player_states = torch.tensor(states[player]).float()
player_actions = torch.tensor(actions[player]).int()
player_rewards = torch.tensor(rewards[player]).float()
player_rewards = normalize(player_rewards)
probs = policy(player_states)
losses = -torch.log(probs[range(len(player_actions)), player_actions] + 1e-10) * player_rewards
loss = losses.mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (epoch/n_games)*100%1 == 0 and epoch > 0:
# calculate the percentage and ETA
percentage = epoch / n_games * 100
elapsed = time.time() - start_time # get the elapsed time
eta = elapsed / (epoch / n_games) * (1 - epoch / n_games) # estimate the remaining time
# convert the elapsed and ETA to hours, minutes, and seconds
elapsed_h, elapsed_m = divmod(elapsed, 3600)
elapsed_m, elapsed_s = divmod(elapsed_m, 60)
eta_h, eta_m = divmod(eta, 3600)
eta_m, eta_s = divmod(eta_m, 60)
# print the progress update
print(f"\r| {percentage:.2f}% Complete | Elapsed: {elapsed_h:.0f}:{elapsed_m:02.0f}:{elapsed_s:02.0f} | ETA: {eta_h:.0f}:{eta_m:02.0f}:{eta_s:02.0f} |", end="")
Running using cpu | 99.00% Complete | Elapsed: 0:03:33 | ETA: 0:00:02 |
Since this was just a for-fun project, I evaluated the performance of my model by playing it. It's... not the best. As I mentioned previously, I was computationally limited. Generally, learning for evolutionary algorithms like this tends to be slower at the beginning, but I decided it was not worth paying for the computational resources to develop this model further.
My main goal with this project was to try something different than I normally do: instead of working with real data, I challenged myself to create the data used to train the model. If I revisit this project in the future, I will try to get my hands on real player data to train a model with.
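A more quantitative check, which I did not run, would be to pit the policy against a uniformly random opponent over a batch of games and look at the win rate. A rough sketch of that idea (the play_vs_random helper is hypothetical):
def play_vs_random(policy, n_games=100):
    wins = 0
    for _ in range(n_games):
        game = Game()
        playing = True
        while playing:
            if game.turn == 1:   # the trained policy plays as player 1, as in the loop below
                x = torch.from_numpy(np.append(game.board.board[1], game.board.board[0])).float()
                candidate_moves = [m.item() for m in torch.argsort(policy(x), descending=True)]
            else:                # the opponent (player 0) just picks among its legal pits at random
                candidate_moves = [p for p in range(6) if game.board.board[0][p] > 0]
                random.shuffle(candidate_moves)
            for move in candidate_moves:
                try:
                    playing, _ = game.move(move)
                    break
                except InvalidMove:
                    continue
        if game.board.stores[0][1] > game.board.stores[0][0]:
            wins += 1
    return wins / n_games
What I actually did is below: I play as player 0 and the model plays as player 1.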
game = Game()
still_playing = True
while still_playing:
while game.turn == 0 and still_playing:
print(game)
while True:
player_move = int(input("Move [0-5]:"))
if player_move == 6:
continue
try:
still_playing, _ = game.move(player_move)
except InvalidMove:
continue
break
# Feed the state to the model
while game.turn == 1 and still_playing:
curr_state = torch.from_numpy(np.append(game.board.board[1], game.board.board[0])).float().to(device)
probs = policy(curr_state)
[print(colored(str(round(float(prob), 3)), "magenta"), end=" ") for prob in reversed(probs)]
print("\n" + colored(game, "light_magenta"))
# Sort the moves by their probabilities
sorted_moves = torch.argsort(probs, descending=True)
for move in sorted_moves:
try:
# Try to make the move
still_playing, _ = game.move(move.item())
break
except InvalidMove:
continue
print(game)
print("Game Over.")
0 4 4 4 4 4 4 4 4 4 4 4 4 0 Player 0
0 4 4 4 4 4 4 4 4 0 5 5 5 1 Player 0
0.0 0.0 0.993 0.007 0.0 0.0
0 4 4 4 4 4 4 4 0 1 6 6 6 1 Player 1
1 5 5 0 4 4 4 5 0 1 6 6 6 1 Player 0
0.732 0.268 0.0 0.0 0.0 0.0
1 5 5 0 4 4 4 0 1 2 7 7 7 1 Player 1
2 0 5 0 4 4 4 1 2 3 8 7 7 1 Player 0
0.0 0.0 0.0 1.0 0.0 0.0
2 0 6 1 5 5 5 1 2 3 0 8 8 2 Player 1
3 1 7 2 0 5 5 2 2 3 0 8 8 2 Player 0
0.0 0.016 0.0 0.0 0.984 0.0
3 1 7 2 0 5 5 2 0 4 1 8 8 2 Player 1
0.554 0.446 0.0 0.0 0.0 0.0
4 2 8 3 1 0 5 2 0 4 1 8 8 2 Player 1
5 0 8 3 1 0 5 3 0 4 1 8 8 2 Player 0
5 0 8 3 1 0 5 3 0 0 2 9 9 3 Player 0
0.989 0.01 0.0 0.001 0.0 0.0
5 0 8 3 1 0 5 3 0 0 0 10 10 3 Player 1
6 1 0 3 1 0 5 4 1 1 1 11 11 3 Player 0
1.0 0.0 0.0 0.0 0.0 0.0
6 2 1 4 2 1 6 5 2 2 2 11 0 4 Player 1
7 0 1 4 2 1 6 6 2 2 2 11 0 4 Player 0
0.0 0.0 1.0 0.0 0.0 0.0
7 1 2 5 3 2 7 7 3 3 2 0 1 5 Player 1
8 2 3 0 3 2 7 8 4 3 2 0 1 5 Player 0
8 2 3 0 3 2 7 8 4 3 2 0 0 6 Player 0
0.0 0.0 0.0 0.0 1.0 0.0
8 2 3 0 3 2 0 8 0 4 3 1 0 14 Player 1
13 2 3 0 4 0 0 8 0 0 3 1 0 14 Player 0
13 2 3 0 4 0 0 8 0 0 0 2 1 15 Player 0
13 2 3 0 4 0 0 8 0 0 0 2 0 16 Player 0
13 2 3 0 4 0 0 8 0 0 0 0 1 17 Player 0
13 2 3 0 4 0 0 8 0 0 0 0 0 18 Player 0
0.0 0.523 0.001 0.222 0.254 0.0
13 2 3 0 4 1 1 0 1 1 1 1 1 19 Player 1
14 3 0 0 4 1 1 1 1 1 1 1 1 19 Player 0
14 3 0 0 4 1 1 1 1 1 1 1 0 20 Player 0
0.0 0.0 0.108 0.892 0.0 0.0
14 3 0 0 4 1 0 1 1 1 1 0 0 22 Player 1
0.181 0.802 0.009 0.008 0.0 0.0
15 4 1 1 0 1 0 1 1 1 1 0 0 22 Player 1
15 5 0 1 0 1 0 1 1 1 1 0 0 22 Player 0
0.914 0.074 0.003 0.008 0.0 0.0
15 5 0 1 0 0 0 1 1 1 0 0 0 24 Player 1
16 0 0 1 0 0 0 2 2 2 1 0 0 24 Player 0
0.165 0.023 0.123 0.096 0.003 0.59
16 0 0 1 0 0 0 0 3 3 1 0 0 24 Player 1
20 0 0 0 0 0 0 0 0 0 0 0 0 28 Player 1
Game Over.
As you can see, the model did not beat me. Oh well.