Initialize Q(s, a) arbitrarily for all s, a
Repeat (for each episode):
    Initialize state s
    Repeat (for each step of episode):
        Choose action a from s using policy derived from Q (e.g., ε-greedy)
        Take action a, observe reward r, and new state s'
        Q(s, a) <- Q(s, a) + α * (r + γ * max_a' Q(s', a') - Q(s, a))
        s <- s'
    until s is terminal

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from collections import deque
import random
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt
import random
import copy
from tqdm import tqdm

class QLearningAgent:
    def __init__(self, states, actions, alpha=0.1, gamma=0.99, epsilon=0.1):
        self.states = states          # Liste ou ensemble des états possibles
        self.actions = actions        # Liste ou ensemble des actions possibles
        self.alpha = alpha            # Taux d'apprentissage
        self.gamma = gamma            # Facteur de décote
        self.epsilon = epsilon        # Probabilité d'exploration (politique epsilon-greedy)
        self.Q = {}                   # Table des valeurs Q initialisée à 0
        
        # Initialisation de la table Q
        for state in states:
            self.Q[state] = {}
            for action in actions:
                self.Q[state][action] = 0.0

    def act(self, state):
        """ Choix d'une action selon une politique epsilon-greedy """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.actions)  # Exploration: choix aléatoire
        else:
            # Exploitation: choisir l'action avec la valeur Q maximale pour l'état actuel
            q_values = self.Q[state]
            max_value = max(q_values.values())
            # En cas de plusieurs actions ayant la même valeur maximale, choisir aléatoirement parmi elles
            return random.choice([k for k, v in q_values.items() if v == max_value])

    def update_q_value(self, state, action, reward, next_state):
        """ Met à jour la table Q en utilisant l'équation de Bellman """
        current_q = self.Q[state][action]
        # Calcul de la valeur Q max pour le nouvel état
        max_next_q = max(self.Q[next_state].values())
        # Mise à jour de la valeur Q
        self.Q[state][action] = current_q + self.alpha * (reward + self.gamma * max_next_q - current_q)

    def train(self, env, episodes):
        """ Entraînement de l'agent sur un environnement spécifié """
        for episode in tqdm(range(episodes)):
            state = env.reset()  # Réinitialisation de l'environnement
            while True:
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)  # Exécute une action
                self.update_q_value(state, action, reward, next_state)
                state = next_state
                if done:
                    break

class GridEnvironment:
    def __init__(self, N, P , diff=2):
        self.grid_lines = N
        self.grid_cols = P
        self.obstacles = self.generate_couples(N, P,diff)
        self.goal = (N-1, P-1)
        self.state = (0, 0)
        self.actions = [0,1,2,3] # ["up", "down", "left", "right"]

    def generate_couples(self, N, P,diff):
        num_couples = N // diff
        couples = [(random.randint(1, N-1), random.randint(1, P-1)) for _ in range(num_couples)]
        return couples


    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        ori=copy.copy(self.state)
        x, y = self.state
        if action == 0: # "up":
            x = max(0, x - 1)
        elif action == 1: # "down":
            x = min(self.grid_lines - 1, x + 1)
        elif action == 2: # "left":
            y = max(0, y - 1)
        elif action == 3: # "right":
            y = min(self.grid_cols - 1, y + 1)

        new_state = (x, y)
        obs=False
        if new_state in self.obstacles:
            reward = -100
            done = False
            obs=True
        elif new_state == self.goal:
            reward = 100
            done = True
        else:
            reward = -1
            done = False
        if not obs:
            self.state = new_state
        return new_state, reward, done, {}

# Création de l'environnement
N=10
P=10
env = GridEnvironment(N,P)

# Exemple d'utilisation avec un agent Q-learning
states = [(x, y) for x in range(env.grid_lines) for y in range(env.grid_cols)]
actions = env.actions

agent = QLearningAgent(states, actions)
#agent.train(env, 1000)

def draw_text(ax, start, action):
    directions_name = {
        0: "up",    # Up
        1: "down",  # Down
        2: "left",  # Left
        3: "right"  # Right
    }
    directions = {
        0: (0, -0.4),   # Up
        1: (0, 0.4),  # Down
        2: (-0.4, 0),  # Left
        3: (0.4, 0)    # Right
    }
    dx, dy = directions[action]
    action_text = str(start[0]) + "," + str(start[1]) #action
    ax.text(start[1]+0.5, start[0]+0.5, action_text, ha='center', va='center', color='black', fontsize=8)
    ax.text(start[1]+0.5, start[0]+0.8, directions_name[action], ha='center', va='center', color='red', fontsize=8)
    ax.arrow(start[1]+0.5, start[0]+0.5, dx, dy, head_width=0.2, head_length=0.2, color='b')

def draw_grid_with_texts(agent, grid_lines, grid_cols):
    fig, ax = plt.subplots()
    ax.set_xlim(-0.5, grid_cols + 0.5)
    ax.set_ylim(grid_lines + 0.5, -0.5)  # Inverser l'axe des y
    ax.set_xticks(range(grid_cols+1))
    ax.set_yticks(range(grid_lines+1))
    ax.grid(which='both')

    cmap = colors.ListedColormap(['white', 'blue', 'red', 'green'])
    grid = np.zeros((grid_lines, grid_cols))
    plot = ax.imshow(grid, cmap=cmap, interpolation='none')
    
    for x in range(grid_lines):
        for y in range(grid_cols):
            action = agent.act((x, y))
            draw_text(ax, (x, y), action)
    
    plt.show()

draw_grid_with_texts(agent, N, P)

def update(frame, env, agent, plot, text):
    if frame == 0 :
        env.reset()  # Reset environment at the start of the animation
    state = env.state
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    
    grid = np.zeros((env.grid_lines, env.grid_cols))
    for obs in env.obstacles:
        grid[obs] = 2
    grid[env.goal] = 3
    grid[next_state] = 1  # Mark agent's position
    
    cmap = colors.ListedColormap(['white', 'blue', 'red', 'green'])  # Normal, agent, obstacles, goal
    norm = colors.BoundaryNorm([0, 1, 2, 3, 4], cmap.N)

    plot.set_data(grid)
    plot.set_cmap(cmap)
    plot.set_norm(norm)
    text.set_text(f'Frame: {frame+1} - State: {next_state} - Action: {action} - Reward: {reward}')
    
    if done or frame == 49:  # Stop after 50 frames or if done
        ani.event_source.stop()
    return [plot, text]

# Test and animate the agent
env.reset()
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none',extent=(0, P , N, 0 ))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=50, fargs=(env, agent, plot, text), blit=True, repeat=False)

# Afficher l'animation dans Jupyter Lab
HTML(ani.to_jshtml())

fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

grid = np.zeros((env.grid_lines, env.grid_cols))
for obs in env.obstacles:
    grid[obs] = 2
grid[env.goal] = 3
grid[(0,0)] = 1  # Mark agent's position

cmap = colors.ListedColormap(['white', 'blue', 'red', 'green'])  # Normal, agent, obstacles, goal
norm = colors.BoundaryNorm([0, 1, 2, 3, 4], cmap.N)

plot = ax.imshow(grid, cmap=cmap, norm=norm, interpolation='none',extent=(0, P , N, 0 ))

plt.show()

agent.train(env, 1000)

100%|██████████| 1000/1000 [00:00<00:00, 17496.83it/s]

# Test and animate the agent
env.reset()
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none',extent=(0, P , N, 0 ))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=50, fargs=(env, agent, plot, text), blit=True, repeat=False)

# Afficher l'animation dans Jupyter Lab
HTML(ani.to_jshtml())

draw_grid_with_texts(agent, N, P)

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from matplotlib.colors import ListedColormap
import copy
from matplotlib import colors

class GridEnvironment:
    def __init__(self, N, P , diff=2):
        self.grid_lines = N
        self.grid_cols = P
        self.obstacles = self.generate_couples(N, P,diff)
        self.goal = (N-1, P-1)
        self.state = (0, 0)
        self.actions = [0,1,2,3] # ["up", "down", "left", "right"]

    def generate_couples(self, N, P,diff):
        num_couples = N // diff
        couples = [(random.randint(1, N-1), random.randint(1, P-1)) for _ in range(num_couples)]
        return couples


    def reset(self):
        self.state = (0, 0)
        return self.state

    def step(self, action):
        x, y = self.state
        if action == 0 and x > 0:  # Up
            x -= 1
        elif action == 1 and x < self.grid_lines - 1:  # Down
            x += 1
        elif action == 2 and y > 0:  # Left
            y -= 1
        elif action == 3 and y < self.grid_cols - 1:  # Right
            y += 1
        else:
            #print(f"Invalid action: {action}")
            pass
        self.state = (x, y)
        reward = -1  # Placeholder reward
        if (x, y) in self.obstacles:
            reward = -10  # Penalty for hitting an obstacle
        elif (x, y) == self.goal:
            reward = 10  # Reward for reaching the goal
        done = (x, y) == self.goal  # Episode ends when goal is reached
        #print(f"Action: {action}, New State: {self.state}, Reward: {reward}, Done: {done}")
        return self.state, reward, done, {}

class DQNNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(DQNNetwork, self).__init__()
        self.dense1 = nn.Linear(state_size, 32)
        self.relu1 = nn.ReLU()
        self.dense2 = nn.Linear(32, 32)
        self.relu2 = nn.ReLU()
        self.output = nn.Linear(32, action_size)

    def forward(self, x):
        x = self.relu1(self.dense1(x))
        x = self.relu2(self.dense2(x))
        x = self.output(x)
        return x

class DQNAgent:
    def __init__(self, grid_lines, grid_cols, action_size):
        self.state_size = 2  # State is now (x, y)
        self.grid_lines = grid_lines
        self.grid_cols = grid_cols
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95
        self.epsilon = 1.2
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = DQNNetwork(self.state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state).unsqueeze(0)
        act_values = self.model(state)
        return act_values.max(1)[1].item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).unsqueeze(0)
            next_state = torch.FloatTensor(next_state).unsqueeze(0)

            current_q_values = self.model(state)
            next_q_values = self.model(next_state).detach()

            max_next_q = torch.max(next_q_values, 1)[0]
            target_q_value = reward + (self.gamma * max_next_q * (1 - int(done)))

            target_q_values = current_q_values.clone()
            target_q_values[0, action] = target_q_value

            loss = torch.nn.functional.mse_loss(current_q_values, target_q_values)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train(self, env, episodes, batch_size):
        for episode in tqdm(range(episodes)):
            state = env.reset()
            done = False
            total_reward = 0
            trys=0
            while not done and trys < 100 :
                trys+=1
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
                if len(self.memory) > batch_size:
                    self.replay(batch_size)
                #print(f"State: {state}, Action: {action}, Reward: {reward}, Next state: {next_state}, Done: {done}")
            #print(f"Episode {episode + 1}/{episodes}, Reward: {total_reward}, Epsilon: {self.epsilon:.2f}")

grid_lines = 10
grid_cols = 10
action_size = 4
agent = DQNAgent(grid_lines, grid_cols, action_size)

# Entrainement avec l'environnement dummy
env = GridEnvironment(grid_lines, grid_cols)
N=10
P=10

draw_grid_with_texts(agent, N, P)

# Test and animate the agent
env.reset()
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none',extent=(0, P , N, 0 ))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=50, fargs=(env, agent, plot, text), blit=True, repeat=False)

# Afficher l'animation dans Jupyter Lab
HTML(ani.to_jshtml())

# Entraînement de l'agent
episodes = 100
batch_size = 32
agent.train(env, episodes, batch_size)

100%|██████████| 100/100 [02:18<00:00,  1.39s/it]

agent.epsilon=0
draw_grid_with_texts(agent, 10, 10)

# Test and animate the agent
env.reset()
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none',extent=(0, P , N, 0 ))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=50, fargs=(env, agent, plot, text), blit=True, repeat=False)

# Afficher l'animation dans Jupyter Lab
HTML(ani.to_jshtml())

import random
import copy
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.animation import FuncAnimation
from matplotlib import animation
from IPython.display import HTML
from queue import Queue

import random

class GridEnvironment:
    def __init__(self, N, P, diff=2):
        self.grid_lines = N
        self.grid_cols = P
        self.obstacles = self.generate_couples(N, P, diff)
        self.goal = (N-1, P-1)
        self.start = (0, 0)
        self.actions = [0, 1, 2, 3, 4]  # ["unvisited", "start", "frontier", "visited", "goal"]
        self.cells = self.initialize_cells()
    
    def generate_couples(self, N, P, diff):
        num_couples = diff
        couples = []
        while len(couples) < num_couples:
            couple = (random.randint(0, N-1), random.randint(0, P-1))
            if couple not in couples and couple != (0, 0) and couple != (N-1, P-1):
                couples.append(couple)
        return couples
    
    def initialize_cells(self):
        cells = {}
        for i in range(self.grid_lines):
            for j in range(self.grid_cols):
                if (i, j) in self.obstacles:
                    cells[(i, j)] = {'state': 'obstacle', 'cost': float('inf')}
                elif (i, j) == self.start:
                    cells[(i, j)] = {'state': 'start', 'cost': 0}
                elif (i, j) == self.goal:
                    cells[(i, j)] = {'state': 'goal', 'cost': 0}
                else:
                    cells[(i, j)] = {'state': 'unvisited', 'cost': 1}
        return cells
    
    def reset(self):
        self.cells = self.initialize_cells()
        self.start = (0, 0)
        return self.cells
    
    def print_grid(self):
        for i in range(self.grid_lines):
            for j in range(self.grid_cols):
                cell = self.cells[(i, j)]
                state = cell['state'][0].upper()  # First letter of state
                print(state, end=" ")
            print()  # Newline after each row
        print("_____________________________________")
        print("")
        for i in range(self.grid_lines):
            for j in range(self.grid_cols):
                cell = self.cells[(i, j)]
                state = cell['cost']  # First letter of state
                print(str(state)+"\t", end=" ")
            print()  # Newline after each row

# Example usage:
grid_env = GridEnvironment(5, 5, 3)
grid_env.print_grid()

S U U U U 
U U O U U 
U U U U U 
U U O U U 
U U U O G 
_____________________________________

0	 1	 1	 1	 1	 
1	 1	 inf	 1	 1	 
1	 1	 1	 1	 1	 
1	 1	 inf	 1	 1	 
1	 1	 1	 inf	 0

# Création de l'environnement
N = 10
P = 10
env = GridEnvironment(N, P)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from queue import Queue

# Breadth First Search (BFS) Implementation
def bfs(env):
    start = env.start
    frontier = Queue()
    frontier.put(start)
    reached = set()
    reached.add(start)
    steps = []

    while not frontier.empty():
        current_step = {}
        current = frontier.get()
        current_step[current] = 'visited'
        
        if current == env.goal:
            steps.append(current_step)
            break

        # Get possible next states
        neighbors = get_neighbors(current, env)
        
        for next in neighbors:
            if next not in reached and next not in env.obstacles:
                frontier.put(next)
                reached.add(next)
                current_step[next] = 'frontier'
        
        steps.append(current_step)
    
    return steps

# Function to get neighbors of a cell
def get_neighbors(cell, env):
    x, y = cell
    neighbors = []
    if x > 0:
        neighbors.append((x-1, y))
    if x < env.grid_lines - 1:
        neighbors.append((x+1, y))
    if y > 0:
        neighbors.append((x, y-1))
    if y < env.grid_cols - 1:
        neighbors.append((x, y+1))
    return neighbors

# Animation update function
def update(frame, env, steps, plot, text):
    step = steps[frame]
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            if (i, j) in step:
                env.cells[(i, j)]['state'] = step[(i, j)]

    grid = np.zeros((env.grid_lines, env.grid_cols))
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            cell_state = env.cells[(i, j)]['state']
            if cell_state == 'unvisited':
                grid[i, j] = 0
            elif cell_state == 'start':
                grid[i, j] = 1
            elif cell_state == 'frontier':
                grid[i, j] = 2
            elif cell_state == 'visited':
                grid[i, j] = 3
            elif cell_state == 'goal':
                grid[i, j] = 4
            elif cell_state == 'obstacle':
                grid[i, j] = 5

    cmap = colors.ListedColormap(['white', 'blue', 'orange', 'red', 'green', 'black'])  # Normal, start, frontier, visited, goal, obstacles
    norm = colors.BoundaryNorm([0, 1, 2, 3, 4, 5, 6], cmap.N)

    plot.set_data(grid)
    plot.set_cmap(cmap)
    plot.set_norm(norm)
    text.set_text(f'Frame: {frame+1}')
    
    return [plot, text]

# Example usage:
env = GridEnvironment(10, 10, 3)
steps = bfs(env)

# Setup for animation
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

'''
plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none')
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=len(steps), fargs=(env, steps, plot, text), blit=True, repeat=False)
'''

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none',extent=(0, P , N, 0 ))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=len(steps), fargs=(env, steps, plot, text), blit=True, repeat=False)

# Afficher l'animation dans Jupyter Lab
HTML(ani.to_jshtml())

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
from queue import Queue

# Define GridEnvironment class
class GridEnvironment:
    def __init__(self, grid_lines, grid_cols, num_obstacles):
        self.grid_lines = grid_lines
        self.grid_cols = grid_cols
        self.start = (0, 0)
        self.goal = (grid_lines - 2, grid_cols - 5)
        self.obstacles = self.generate_obstacles(num_obstacles)
        self.cells = {(i, j): {'state': 'unvisited'} for i in range(grid_lines) for j in range(grid_cols)}
        self.cells[self.start]['state'] = 'start'
        self.cells[self.goal]['state'] = 'goal'
        for obs in self.obstacles:
            self.cells[obs]['state'] = 'obstacle'
    
    def generate_obstacles(self, num_obstacles):
        obstacles = set()
        while len(obstacles) < num_obstacles:
            x = np.random.randint(0, self.grid_lines)
            y = np.random.randint(0, self.grid_cols)
            if (x, y) != self.start and (x, y) != self.goal:
                obstacles.add((x, y))
        return obstacles

# Breadth First Search (BFS) Implementation
def bfs(env):
    start = env.start
    frontier = Queue()
    frontier.put(start)
    reached = set()
    reached.add(start)
    steps = []

    while not frontier.empty():
        current_step = {}
        current = frontier.get()
        current_step[current] = 'visited'
        
        if current == env.goal:
            steps.append(current_step)
            break

        # Get possible next states
        neighbors = get_neighbors(current, env)
        
        for next in neighbors:
            if next not in reached and next not in env.obstacles:
                frontier.put(next)
                reached.add(next)
                current_step[next] = 'frontier'
        
        steps.append(current_step)
    
    return steps

# Function to get neighbors of a cell
def get_neighbors(cell, env):
    x, y = cell
    neighbors = []
    if x > 0:
        neighbors.append((x-1, y))
    if x < env.grid_lines - 1:
        neighbors.append((x+1, y))
    if y > 0:
        neighbors.append((x, y-1))
    if y < env.grid_cols - 1:
        neighbors.append((x, y+1))
    return neighbors

# Animation update function
def update(frame, env, steps, plot, text):
    step = steps[frame]
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            if (i, j) in step:
                env.cells[(i, j)]['state'] = step[(i, j)]

    grid = np.zeros((env.grid_lines, env.grid_cols))
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            cell_state = env.cells[(i, j)]['state']
            if cell_state == 'unvisited':
                grid[i, j] = 0
            elif cell_state == 'start':
                grid[i, j] = 1
            elif cell_state == 'frontier':
                grid[i, j] = 2
            elif cell_state == 'visited':
                grid[i, j] = 3
            elif cell_state == 'goal':
                grid[i, j] = 4
            elif cell_state == 'obstacle':
                grid[i, j] = 5

    cmap = colors.ListedColormap(['white', 'blue', 'orange', 'red', 'green', 'black'])  # Normal, start, frontier, visited, goal, obstacles
    norm = colors.BoundaryNorm([0, 1, 2, 3, 4, 5, 6], cmap.N)

    plot.set_data(grid)
    plot.set_cmap(cmap)
    plot.set_norm(norm)
    text.set_text(f'Frame: {frame+1}')
    
    return [plot, text]

# Example usage:
env = GridEnvironment(10, 10, 15)
steps = bfs(env)


# Setup for animation
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none', extent=(0, env.grid_cols, env.grid_lines, 0))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=len(steps), fargs=(env, steps, plot, text), blit=True, repeat=False)

# Display the animation in Jupyter Lab
HTML(ani.to_jshtml())

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import heapq

# Define GridEnvironment class
class GridEnvironment:
    def __init__(self, grid_lines, grid_cols, num_obstacles):
        self.grid_lines = grid_lines
        self.grid_cols = grid_cols
        self.start = (0, 0)
        self.goal = (grid_lines - 2, grid_cols - 5)
        self.obstacles = self.generate_obstacles(num_obstacles)
        self.cells = {(i, j): {'state': 'unvisited', 'cost': np.random.randint(1, 10)} for i in range(grid_lines) for j in range(grid_cols)}
        self.cells[self.start]['state'] = 'start'
        self.cells[self.goal]['state'] = 'goal'
        for obs in self.obstacles:
            self.cells[obs]['state'] = 'obstacle'
            self.cells[obs]['cost'] = float('inf')
    
    def generate_obstacles(self, num_obstacles):
        obstacles = set()
        while len(obstacles) < num_obstacles:
            x = np.random.randint(0, self.grid_lines)
            y = np.random.randint(0, self.grid_cols)
            if (x, y) != self.start and (x, y) != self.goal:
                obstacles.add((x, y))
        return obstacles

# Dijkstra's Algorithm (Uniform Cost Search) Implementation
def dijkstra(env):
    start = env.start
    frontier = [(0, start)]
    reached = {start: 0}
    steps = []

    while frontier:
        current_cost, current = heapq.heappop(frontier)
        current_step = {current: 'visited'}
        
        if current == env.goal:
            steps.append(current_step)
            break

        neighbors = get_neighbors(current, env)
        
        for next in neighbors:
            if next not in env.obstacles:
                new_cost = current_cost + env.cells[next]['cost']
                if next not in reached or new_cost < reached[next]:
                    reached[next] = new_cost
                    heapq.heappush(frontier, (new_cost, next))
                    current_step[next] = 'frontier'
        
        steps.append(current_step)
    
    return steps

# Function to get neighbors of a cell
def get_neighbors(cell, env):
    x, y = cell
    neighbors = []
    if x > 0:
        neighbors.append((x-1, y))
    if x < env.grid_lines - 1:
        neighbors.append((x+1, y))
    if y > 0:
        neighbors.append((x, y-1))
    if y < env.grid_cols - 1:
        neighbors.append((x, y+1))
    return neighbors

# Animation update function
def update(frame, env, steps, plot, text):
    step = steps[frame]
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            if (i, j) in step:
                env.cells[(i, j)]['state'] = step[(i, j)]

    grid = np.zeros((env.grid_lines, env.grid_cols))
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            cell_state = env.cells[(i, j)]['state']
            if cell_state == 'unvisited':
                grid[i, j] = 0
            elif cell_state == 'start':
                grid[i, j] = 1
            elif cell_state == 'frontier':
                grid[i, j] = 2
            elif cell_state == 'visited':
                grid[i, j] = 3
            elif cell_state == 'goal':
                grid[i, j] = 4
            elif cell_state == 'obstacle':
                grid[i, j] = 5

    cmap = colors.ListedColormap(['white', 'blue', 'orange', 'red', 'green', 'black'])  # Normal, start, frontier, visited, goal, obstacles
    norm = colors.BoundaryNorm([0, 1, 2, 3, 4, 5, 6], cmap.N)

    plot.set_data(grid)
    plot.set_cmap(cmap)
    plot.set_norm(norm)
    text.set_text(f'Frame: {frame+1}')
    
    return [plot, text]

# Example usage:
env = GridEnvironment(10, 10, 15)
steps = dijkstra(env)

# Setup for animation

fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')


plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none', extent=(0, env.grid_cols, env.grid_lines, 0))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

ani = FuncAnimation(fig, update, frames=len(steps), fargs=(env, steps, plot, text), blit=True, repeat=False)

# Display the animation in Jupyter Lab
HTML(ani.to_jshtml())

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import heapq

# Define GridEnvironment class
class GridEnvironment:
    def __init__(self, grid_lines, grid_cols, num_obstacles):
        self.grid_lines = grid_lines
        self.grid_cols = grid_cols
        self.start = (0, 0)
        self.goal = (grid_lines - 2, grid_cols - 5)
        self.obstacles = self.generate_obstacles(num_obstacles)
        self.cells = {(i, j): {'state': 'unvisited', 'cost': np.random.randint(1, 10)} for i in range(grid_lines) for j in range(grid_cols)}
        self.cells[self.start]['state'] = 'start'
        self.cells[self.goal]['state'] = 'goal'
        for obs in self.obstacles:
            self.cells[obs]['state'] = 'obstacle'
            self.cells[obs]['cost'] = float('inf')
    
    def generate_obstacles(self, num_obstacles):
        obstacles = set()
        while len(obstacles) < num_obstacles:
            x = np.random.randint(0, self.grid_lines)
            y = np.random.randint(0, self.grid_cols)
            if (x, y) != self.start and (x, y) != self.goal:
                obstacles.add((x, y))
        return obstacles

# Dijkstra's Algorithm (Uniform Cost Search) Implementation
def dijkstra(env):
    start = env.start
    frontier = [(0, start)]
    reached = {start: 0}
    came_from = {start: None}
    steps = []

    while frontier:
        current_cost, current = heapq.heappop(frontier)
        current_step = {current: 'visited'}
        
        if current == env.goal:
            steps.append(current_step)
            break

        neighbors = get_neighbors(current, env)
        
        for next in neighbors:
            if next not in env.obstacles:
                new_cost = current_cost + env.cells[next]['cost']
                if next not in reached or new_cost < reached[next]:
                    reached[next] = new_cost
                    heapq.heappush(frontier, (new_cost, next))
                    current_step[next] = 'frontier'
                    came_from[next] = current
        
        steps.append(current_step)
    
    # Reconstruct path
    path = []
    current = env.goal
    while current is not None:
        path.append(current)
        current = came_from[current]
    path.reverse()

    return steps, path

# Function to get neighbors of a cell
def get_neighbors(cell, env):
    x, y = cell
    neighbors = []
    if x > 0:
        neighbors.append((x-1, y))
    if x < env.grid_lines - 1:
        neighbors.append((x+1, y))
    if y > 0:
        neighbors.append((x, y-1))
    if y < env.grid_cols - 1:
        neighbors.append((x, y+1))
    return neighbors

# Animation update function
def update(frame, env, steps, plot, text, costs, path_line):
    step = steps[frame]
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            if (i, j) in step:
                env.cells[(i, j)]['state'] = step[(i, j)]

    grid = np.zeros((env.grid_lines, env.grid_cols))
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            cell_state = env.cells[(i, j)]['state']
            if cell_state == 'unvisited':
                grid[i, j] = 0
            elif cell_state == 'start':
                grid[i, j] = 1
            elif cell_state == 'frontier':
                grid[i, j] = 2
            elif cell_state == 'visited':
                grid[i, j] = 3
            elif cell_state == 'goal':
                grid[i, j] = 4
            elif cell_state == 'obstacle':
                grid[i, j] = 5

    cmap = colors.ListedColormap(['white', 'blue', 'orange', 'red', 'green', 'black'])  # Normal, start, frontier, visited, goal, obstacles
    norm = colors.BoundaryNorm([0, 1, 2, 3, 4, 5, 6], cmap.N)

    plot.set_data(grid)
    plot.set_cmap(cmap)
    plot.set_norm(norm)
    text.set_text(f'Frame: {frame+1}')

    for (i, j), cost_text in costs.items():
        cost_text.set_text(env.cells[(i, j)]['cost'])

    # Draw the path if we reached the goal
    if frame == len(steps) - 1:
        path_coords = np.array(path)
        path_line.set_data(path_coords[:, 1] + 0.5, path_coords[:, 0] + 0.5)
    
    return [plot, text] + list(costs.values()) + [path_line]

# Example usage:
env = GridEnvironment(10, 10, 15)
steps, path = dijkstra(env)

fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none', extent=(0, env.grid_cols, env.grid_lines, 0))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

# Display costs
costs = {}
for i in range(env.grid_lines):
    for j in range(env.grid_cols):
        costs[(i, j)] = ax.text(j + 0.5, i + 0.5, env.cells[(i, j)]['cost'], ha='center', va='center', color='black')

# Add path line
path_line, = ax.plot([], [], 'yellow', linewidth=2)

ani = FuncAnimation(fig, update, frames=len(steps), fargs=(env, steps, plot, text, costs, path_line), blit=True, repeat=False)

# Display the animation in Jupyter Lab
HTML(ani.to_jshtml())

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.animation import FuncAnimation
from IPython.display import HTML
import heapq

# Define GridEnvironment class
class GridEnvironment:
    def __init__(self, grid_lines, grid_cols, num_obstacles):
        self.grid_lines = grid_lines
        self.grid_cols = grid_cols
        self.start = (0, 0)
        self.goal = (grid_lines - 2, grid_cols - 5)
        self.obstacles = self.generate_obstacles(num_obstacles)
        self.cells = {(i, j): {'state': 'unvisited', 'cost': np.random.randint(1, 10)} for i in range(grid_lines) for j in range(grid_cols)}
        self.cells[self.start]['state'] = 'start'
        self.cells[self.goal]['state'] = 'goal'
        for obs in self.obstacles:
            self.cells[obs]['state'] = 'obstacle'
            self.cells[obs]['cost'] = float('inf')
    
    def generate_obstacles(self, num_obstacles):
        obstacles = set()
        while len(obstacles) < num_obstacles:
            x = np.random.randint(0, self.grid_lines)
            y = np.random.randint(0, self.grid_cols)
            if (x, y) != self.start and (x, y) != self.goal:
                obstacles.add((x, y))
        return obstacles

# A* Algorithm Implementation
def a_star(env):
    start = env.start
    goal = env.goal
    frontier = []
    heapq.heappush(frontier, (0, start))
    came_from = {start: None}
    cost_so_far = {start: 0}
    steps = []

    while frontier:
        _, current = heapq.heappop(frontier)
        current_step = {current: 'visited'}

        if current == goal:
            steps.append(current_step)
            break

        neighbors = get_neighbors(current, env)

        for next in neighbors:
            if next not in env.obstacles:
                new_cost = cost_so_far[current] + env.cells[next]['cost']
                if next not in cost_so_far or new_cost < cost_so_far[next]:
                    cost_so_far[next] = new_cost
                    priority = new_cost + heuristic(goal, next)
                    heapq.heappush(frontier, (priority, next))
                    came_from[next] = current
                    current_step[next] = 'frontier'
        
        steps.append(current_step)
    
    # Reconstruct path
    path = []
    current = goal
    while current is not None:
        path.append(current)
        current = came_from[current]
    path.reverse()

    return steps, path

# Heuristic function (Manhattan distance)
def heuristic(a, b):
    (x1, y1) = a
    (x2, y2) = b
    return abs(x1 - x2) + abs(y1 - y2)

# Function to get neighbors of a cell
def get_neighbors(cell, env):
    x, y = cell
    neighbors = []
    if x > 0:
        neighbors.append((x-1, y))
    if x < env.grid_lines - 1:
        neighbors.append((x+1, y))
    if y > 0:
        neighbors.append((x, y-1))
    if y < env.grid_cols - 1:
        neighbors.append((x, y+1))
    return neighbors

# Animation update function
def update(frame, env, steps, plot, text, costs, path_line):
    step = steps[frame]
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            if (i, j) in step:
                env.cells[(i, j)]['state'] = step[(i, j)]

    grid = np.zeros((env.grid_lines, env.grid_cols))
    
    for i in range(env.grid_lines):
        for j in range(env.grid_cols):
            cell_state = env.cells[(i, j)]['state']
            if cell_state == 'unvisited':
                grid[i, j] = 0
            elif cell_state == 'start':
                grid[i, j] = 1
            elif cell_state == 'frontier':
                grid[i, j] = 2
            elif cell_state == 'visited':
                grid[i, j] = 3
            elif cell_state == 'goal':
                grid[i, j] = 4
            elif cell_state == 'obstacle':
                grid[i, j] = 5

    cmap = colors.ListedColormap(['white', 'blue', 'orange', 'red', 'green', 'black'])  # Normal, start, frontier, visited, goal, obstacles
    norm = colors.BoundaryNorm([0, 1, 2, 3, 4, 5, 6], cmap.N)

    plot.set_data(grid)
    plot.set_cmap(cmap)
    plot.set_norm(norm)
    text.set_text(f'Frame: {frame+1}')

    for (i, j), cost_text in costs.items():
        cost_text.set_text(env.cells[(i, j)]['cost'])

    # Draw the path if we reached the goal
    if frame == len(steps) - 1:
        path_coords = np.array(path)
        path_line.set_data(path_coords[:, 1] + 0.5, path_coords[:, 0] + 0.5)
    
    return [plot, text] + list(costs.values()) + [path_line]

# Example usage:
env = GridEnvironment(10, 10, 15)
steps, path = a_star(env)

# Setup for animation
fig, ax = plt.subplots()
ax.set_xlim(-0.5, P + 0.5)
ax.set_ylim(N + 0.5, -0.5)  # Inverser l'axe des y
ax.set_xticks(range(N+1))
ax.set_yticks(range(P+1))
ax.grid(which='both')

plot = ax.imshow(np.zeros((env.grid_lines, env.grid_cols)), cmap='viridis', interpolation='none', extent=(0, env.grid_cols, env.grid_lines, 0))
text = ax.text(0.5, -0.1, '', transform=ax.transAxes, ha='center')

# Display costs
costs = {}
for i in range(env.grid_lines):
    for j in range(env.grid_cols):
        costs[(i, j)] = ax.text(j + 0.5, i + 0.5, env.cells[(i, j)]['cost'], ha='center', va='center', color='black')

# Add path line
path_line, = ax.plot([], [], 'yellow', linewidth=2)

ani = FuncAnimation(fig, update, frames=len(steps), fargs=(env, steps, plot, text, costs, path_line), blit=True, repeat=False)

# Display the animation in Jupyter Lab
HTML(ani.to_jshtml())

import pandas as pd
from datasets import load_dataset, load_metric
import re
from IPython.display import display, Markdown
import json
import requests
from tqdm import tqdm
import evaluate
from datetime import datetime

# pip install evaluate

accuracy = evaluate.load("accuracy")

print(accuracy.description)

Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative

# math = evaluate.load("competition_math") pip install git+https://github.com/hendrycks/math.git

dataset = load_dataset("lighteval/MATH", 'all', split='test[:100]')
df = pd.DataFrame(dataset)

def extract_boxed_answer(answer):
    '''
    Extracts the content within the last \boxed{} in the answer, handling nested braces
    '''
    pattern = re.compile(r'\\boxed{((?:[^{}]|\{(?:[^{}]|\{[^{}]*\})*\})*)}')
    matches = pattern.findall(answer)

    if matches:
        return matches[-1]
    return None

def get_MATH_QA(row_number=None):
    #Load the dataset

    if row_number is None:
        row_number=int(input("Please enter the row number (0-99)"))

    if row_number <0 or row_number >=len(df):
        raise ValueError("Row number must be between 0 and 99")

    selected_row = df.iloc[row_number]

    question = selected_row['problem']
    full_answer=selected_row['solution']
    short_answer = extract_boxed_answer(full_answer)

    return question, full_answer, short_answer

question, full_answer,short_answer = get_MATH_QA() # 14

prompt = """Giving this problem : {question}

please write agani the final solution, only the final answer in the last line without any extra text"""
s0=prompt.format(question=question)

display(Markdown("question : "+ question))
display(Markdown("full ======= > " +full_answer))
display(Markdown("_____________ > " + short_answer))

display(Markdown('### $\\boxed{ '+extract_boxed_answer(full_answer)+ '}$'))

def llm(messages, temperature=0.2, stream=False):
    headers = {
        "Content-Type": "application/json",
    }
    data = {
        "model": "gemma2:latest",
        "messages": [
    {
      "role": "user",
      "content": messages
    }
  ],
        "stream" : False,
        "options": {
            "seed": 101,
            "temperature": temperature,
            "max_token" : 1500
          }
    }
    api_chat_endpoint="http://localhost:11434/api/chat"         
    response = requests.post(api_chat_endpoint, headers=headers, data=json.dumps(data), stream=stream)
    ret=""
    if response.status_code == 200:
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8').strip()
                #print("stripped line=", line, "", "")
                try:
                    content = json.loads(line)["message"]["content"]
                    ret+= content
                except Exception as e:
                    print(f"Impossible de décoder la ligne JSON : {line}, erreur: {e}")
                    continue
    else:
        print(f"API Error {response.status_code}: {response.text}")
        raise Exception(f"API Error {response.status_code}: {response.text}")
    return ret

def llm2(user_msg,assistant_msg, temperature=0.9, stream=False):
    headers = {
        "Content-Type": "application/json",
    }
    data = {
        "model": "gemma2:latest",
        "messages": [
    {
      "role": "user",
      "content": user_msg
    },
    {
      "role": "assistant",
      "content": assistant_msg
    },      
  ],
        "stream" : False,
        "options": {
            # "seed": 101,
            "temperature": temperature,
            "max_token" : 1500,
            "stop" : ["\n","\\n","\n\n","\\n\\n"]
          }
    }
    api_chat_endpoint="http://localhost:11434/api/chat"         
    response = requests.post(api_chat_endpoint, headers=headers, data=json.dumps(data), stream=stream)
    ret=""
    if response.status_code == 200:
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8').strip()
                #print("stripped line=", line, "", "")
                try:
                    content = json.loads(line)["message"]["content"]
                    ret+= content
                except Exception as e:
                    print(f"Impossible de décoder la ligne JSON : {line}, erreur: {e}")
                    continue
    else:
        print(f"API Error {response.status_code}: {response.text}")
        raise Exception(f"API Error {response.status_code}: {response.text}")
    return ret

prompt = """Problem : {question}

please think step by step, separate reasoning steps by double new lines, outpout the final answer in the last line without any extra text"""

prompt=prompt.format(question=question)
print(prompt)

Problem : Kite $ABCD$ (a quadrilateral with two pairs of adjacent equal sides) has coordinates $A\ (0,7),\ B\ (1,0),\ C\ (12,-2),$ and $D\ (7,8).$ What is the area of $ABCD,$ given that the area of a kite is equal to half the product of its diagonals?

[asy]
string sp(pair P, string P2){return "$" + P2 + "\,(" + string(P.x) + "," + string(P.y) + ")$";}
size(150); defaultpen(fontsize(10)); draw((-5,0)--(15,0),Arrows(4)); draw((0,-5)--(0,10),Arrows(4)); pair A=(0,7),B=(1,0),C=(12,-2),D=(7,8); draw(A--B--C--D--cycle, linewidth(0.7)); draw(A--C, dashed); draw(B--D, dashed); label(sp(A,"A"),A,W); label(sp(B,"B"),B,S); label(sp(C,"C"),C,E); label(sp(D,"D"),D,N);
[/asy]

please think step by step, separate reasoning steps by double new lines, outpout the final answer in the last line without any extra text

ret= llm(prompt) # +"\nPlease output the final answer on the last line of your output without any prefix or suffix")

display(Markdown(ret))

# Fonction d'évaluation
def evaluate_rep(example):
    prompt = example['problem']
    prediction = llm(f"Giving this problem : {prompt}\n\nplease write agani the final solution, only the final answer in the last line without any extra tex").strip().split("\n")[-1]
    return {
        'reference': extract_boxed_answer(example['solution']),
        'prediction': prediction
    }

len(dataset)

100

#Fonction d'évaluation
def evaluate_rep(example):
    prompt = example['problem']
    prediction = llm(f"Giving this problem : {prompt}\n\nplease write agani the final solution, only the final answer in the last line without any extra tex").strip().split("\n")[-1]
    return {
        'reference': extract_boxed_answer(example['solution']),
        'prediction': prediction
    }
# Application de l'évaluation sur le dataset
results = []
for example in tqdm(dataset):
    results.append(evaluate_rep(example))

with open("math_eval_01.json", 'w') as file:
    json.dump(results, file, indent=4)

for r in results[:10]:
    print(r)

{'reference': '2', 'prediction': '**Answer:** 2'}
{'reference': '10', 'prediction': '10'}
{'reference': '\\dfrac{9}{7}', 'prediction': 'The only solution that works is $x = \\boxed{\\frac{2}{7}}$.'}
{'reference': 'i', 'prediction': 'i'}
{'reference': '4', 'prediction': '**Answer:** 4'}
{'reference': '402', 'prediction': '402'}
{'reference': 'x \\in [-2,7]', 'prediction': '**Solution in interval notation:** [-2, 7]'}
{'reference': '7', 'prediction': '**Answer:** 6'}
{'reference': '4,6,14,15', 'prediction': '4, 6, 12, 18'}
{'reference': '-\\frac{1}{8}', 'prediction': 'We have two possible solutions: x = 3/2 and x = -1/8. The smallest of these is **-1/8**.'}

# Calcul des scores
predictions = [result['prediction'] for result in results]
references = [result['reference'] for result in results]

# Initialisation des listes pour stocker les références et les prédictions nettoyées
references = []
predictions = []

# Fonction pour remplacer tout ce qui est entre deux ** par le contenu entre eux
def clean_prediction(prediction):
    # Remplacer tout ce qui est entre deux ** par le contenu entre eux
    cleaned = re.sub(r'\*\*(.*?)\*\*', r'\1', prediction)
    # Supprimer 'Finale answer'
    cleaned = cleaned.replace('Final answer:', '').strip()

    cleaned = cleaned.replace('Answer:', '').strip()
    
    return cleaned

# Parcourir chaque dictionnaire dans la liste
for item in results:
    # Ajouter la référence à la liste des références
    references.append(item['reference'])
    
    # Nettoyer la prédiction et l'ajouter à la liste des prédictions
    cleaned_prediction = clean_prediction(item['prediction']).strip()
    predictions.append(cleaned_prediction)

# Afficher les listes obtenues
print("References: ____________ ", references)
print("Predictions: ___________ ", predictions)

References: ____________  ['2', '10', '\\dfrac{9}{7}', 'i', '4', '402', 'x \\in [-2,7]', '7', '4,6,14,15', '-\\frac{1}{8}', '\\frac{x+2}{7}', '-15', '10', '8', '75', '\\frac{11}{2}', '-25', '8', '3', '187.5', '18', '\\$40', '5', '8', '3125', '[0,\\infty)', '.5', '12, 10, 6', '5', '16', '2300', '5', '105', '-13.5', '\\frac{243}{625}', '2', '(-\\sqrt{3}, \\sqrt{3})', '23', '49', '2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2', '(-\\infty,-8)\\cup (8,\\infty)', '0', '2', '16', '\\frac{1}{12}', '6+9i', '2', '20', '7(x+3) (x-3)', 'y^4-2y^3+7y^2+y-5', '4', '0', '12', '\\frac{7}{2}', '69', '5', '\\left(-\\infty,-\\frac 12\\right)\\cup \\left(-\\frac 12,\\infty\\right)', '\\frac{2}{5}', '7', '20', '7', '\\sqrt{x}', '78', '9', '4', '6', '17', '\\left(1,\\frac{9}{2}\\right)', '\\frac{15}{2}', '-2', '8', '20', '24', '\\frac{19}{4}', '5', '-55', '60', '-7', '0.43', '108', '4950', '50', '14', '8', '26', '129', '0', '-5', '2', '4', '30', '161', '1', '5', '3s^2', '125', '8', '286', '(9,11)', '\\dfrac{1}{5}']
Predictions: ___________  ['2', '10', 'The only solution that works is $x = \\boxed{\\frac{2}{7}}$.', 'i', '4', '402', 'Solution in interval notation: [-2, 7]', '6', '4, 6, 12, 18', 'We have two possible solutions: x = 3/2 and x = -1/8. The smallest of these is -1/8.', '* *h⁻¹(x) = (x + 2) / 7*', '```', '10', '8', "Let me know if you'd like me to work through the calculations step-by-step!", '5.5', '-25', "Let me know if you'd like to explore a similar problem with slightly different conditions!", '3', '187.5', '16', '$20.00', '* Therefore, x = 5', '8', '3125', '[0, ∞)', '1/2', '1, 4, 9, 16', '0', 'The largest possible value of b is 16.', '$2300', '5', '67.5', '17.3', 'Calculate (3/5)^7 and then multiply by (125/9). This will give you the eighth term as a fraction.', 'n = 2', '$-\\sqrt{7} < x < \\sqrt{7}$', 'Finally,  $a + b + c = 1 + 2 + \\sqrt{6} = \\boxed{3+\\sqrt{6}}$.', '49', '$$2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2$$', 'The solution is  $m \\in (-\\infty, -8) \\cup (8, \\infty)$.', 'Therefore, the value of $b$ is $\\boxed{0}$.', '2', 'The bookstore should charge $16 to maximize its revenue.', 'w = 1/12', '6 + 9i', '2', 'Therefore, the simplified form of (2 - 2i)(5 + 5i) is 20.', 'Therefore, the factored expression is: $\\boxed{7(x + 3)(x - 3)}$', '$$y^4 - 2y^3 + 7y^2 + y - 5$$', '6', '*f(x) - f*<sup>-1</sup>(*x*) = *f(x) - f(x)* = 0', '107', '7/2', "Let me know if you'd like me to work through the specific calculations!", '25/4', '(-∞, -1/2) U (-1/2, ∞)', '4/10 = 2/5', '* Final  7', 'Therefore, the simplified form of (3 - i)(6 + 2i) is 20.', '7', 'x^(7/18)', '78', '9', '4', '6', '108', 'Therefore, the midpoint of $\\overline{PQ}$ is $(1, \\frac{9}{2})$.', '7', '-2', '8', '20', '24', 'k = 65/8', '5', '-55', '√[3]{216000} = 60', 'x = -7', '0.71', '108', '4950', "Let me know if you'd like me to work through the detailed solution steps!", 'There is no solution that satisfies the given conditions.', '8 quarts', '26', '360', '0', 'b = -5', 'e = 2', '4', '30', "Let me know if you'd like me to work through the entire calculation!", '1', '5', '$3s^2$', "Let me know if you'd like me to work through the detailed algebraic steps!", '8', '287', 'All four points lie on different lines.', '1/5']

score = evaluate.load('exact_match').compute(predictions=predictions, references=references)

# Affichage des résultats
print(f"Score exact_match sur le dataset lighteval/MATH: {score}")

Score exact_match sur le dataset lighteval/MATH: {'exact_match': 0.36}

score = evaluate.load('competition_math').compute(predictions=predictions, references=references)
print(f"Score exact_match sur le dataset lighteval/MATH: {score}")

Score exact_match sur le dataset lighteval/MATH: {'accuracy': 0.45}

# pip install git+https://github.com/hendrycks/math.git

MAX_STEPS=100

prompt="""Question : {question}

Please think step by step and output each reasoning step in a new line.
You MUST output the very final answer value in the last line without any prefix or extra text, ONLY the answer numerical value.

"""

def step(s0, state, action):
    next_step=llm2(s0, state+'\n'+action)
    return next_step

s0=prompt.format(question=question)

st0=step(s0,"","")
print(st0)

 To find $120\%$ of 30, we need to multiply $30$ by $\frac{120}{100}$, which is equivalent to multiplying by $1.2$.

llm(s0)

' Step 1: Calculate $120\\%$ of 30\n$120\\%$ of 30 is equal to $(120/100) * 30 = 1.2 * 30 = 36$.\n\nStep 2: Calculate $130\\%$ of 20\n$130\\%$ of 20 is equal to $(130/100) * 20 = 1.3 * 20 = 26$.\n\nStep 3: Find the positive difference between the two values\nThe positive difference between $36$ and $26$ is $36 - 26 = 10$.\n\nFinal answer: 10'

def process_text(text):
    # Convert the text to a list of lines
    lines = text.split('\n')
    # Strip empty lines at the beginning and the end
    stripped_lines = [line for line in lines if line.strip()]
    
    return stripped_lines

# Exemple de texte multi-lignes
llm_traj = """
    This is the first line.

    This is the second line.
    
    This is the third line.
    
"""

# Appel de la fonction
result = process_text(llm_traj)

# Afficher le résultat
print(result)

['    This is the first line.', '    This is the second line.', '    This is the third line.']

def trajectory(s0, state):
    states=[]
    traj=llm(s0+'\n'+state)
    return process_text(traj)

traj1=trajectory(s0,st0)

traj1

[' So, $120\\%$ of 30 is:',
 '$$30 \\times 1.2 = 36.$$',
 'To find $130\\%$ of 20, we need to multiply $20$ by $\\frac{130}{100}$, which is equivalent to multiplying by $1.3$.',
 'So, $130\\%$ of 20 is:',
 '$$20 \\times 1.3 = 26.$$',
 'The positive difference between these two values is:',
 '$$36 - 26 = 10.$$',
 'Therefore, the final answer is $\\boxed{10}$.']

def nettoyer_chaine(chaine):
    # Supprimer les lignes vides à la fin
    lignes = chaine.splitlines()
    while lignes and not lignes[-1].strip():
        lignes.pop()
    
    # Conserver uniquement la dernière ligne non vide
    derniere_ligne = lignes[-1] if lignes else ""
    
    # Supprimer les espaces en début et fin de chaîne
    derniere_ligne = derniere_ligne.strip().lower()

    # Expression régulière mise à jour pour capturer le texte après "solution"
    pattern = r"^\**solution\**:* *\**\s*(.*)$"
    match = re.match(pattern, derniere_ligne, re.IGNORECASE)
    if match:
        derniere_ligne=  match.group(1).strip()

    pattern = r"^\**answer\**:* *\**\s*(.*)$"
    match = re.match(pattern, derniere_ligne, re.IGNORECASE)
    if match:
        derniere_ligne=  match.group(1).strip()

    pattern = r"^\**final answer\**:* *\**\s*(.*)$"
    match = re.match(pattern, derniere_ligne, re.IGNORECASE)
    if match:
        derniere_ligne=  match.group(1).strip()

    pattern = r"^\**final solution\**:* *\**\s*(.*)$"
    match = re.match(pattern, derniere_ligne, re.IGNORECASE)
    if match:
        derniere_ligne=  match.group(1).strip()
    
    # Supprimer à nouveau les espaces au cas où
    derniere_ligne = derniere_ligne.strip()
    cl= extract_boxed_answer(derniere_ligne)
    if cl is not None:
        derniere_ligne=cl
        
    return derniere_ligne

metric=evaluate.load('exact_match')

def reward(s0,state,action):
    final_ret=llm(s0+'\n'+'\n'+state)
    final_answer =  nettoyer_chaine(final_ret)
    gt_answer=short_answer
    t,r1 = evaluer_expression_latex(gt_answer)
    if t:
        #gt_answer=str(r1)
        t2,r2 = evaluer_expression_latex(final_answer)
        if t2:
            if r1==r2:
                #print("Rawrd = 1")
                return 1
                
    score = metric.compute(predictions=[final_answer], references=[gt_answer])

    # Affichage des résultats
    #print(f"Score  {final_answer} /   {gt_answer} : {score}")
    return score['exact_match']

import sympy as sp
from latex2sympy2 import latex2sympy

def evaluer_expression_latex(expression):
    try:
        # Convertir l'expression LaTeX en une expression sympy
        expr_sympy = latex2sympy(expression)
        # Évaluer l'expression et comparer à la valeur donnée
        expr_valeur = expr_sympy.evalf()
        return True, expr_valeur
    except Exception as e:
        # print(f"Erreur lors de l'interprétation de l'expression : {e}")
        return False,0

#!pip install latex2sympy2

def roll_out(s0,state,action):
    pool = []
    nb_iterations=1
    for i in range(nb_iterations):
        r1=llm2(s0,state+'\n'+action)
        r2=llm2(s0+'\n'+"\Solution :\n",state+'\n'+action)
        r3=llm2(s0+'\nSteps :\n',state+'\n'+action)
        pool.append(r1)
        pool.append(r2)
        pool.append(r3)
    return pool

pool0=roll_out(s0,"","")

pool0

[' To find the positive difference between $120\\%$ of 30 and $130\\%$ of 20, we will first calculate each percentage individually and then subtract one from the other.',
 ' First, we need to calculate $120\\%$ of 30:',
 ' Step 1: Calculate $120\\%$ of 30.']

reward(s0,"",pool0[3])

1

def remove_duplicates(input_list):
    seen = set()
    return [x for x in input_list if not (x in seen or seen.add(x))]

clean_pool0=remove_duplicates(pool0)

clean_pool0

[' To find the positive difference between $120\\%$ of 30 and $130\\%$ of 20, we will first calculate each percentage individually and then subtract one from the other.',
 ' First, we need to calculate $120\\%$ of 30:',
 ' Step 1: Calculate $120\\%$ of 30.']

s0

'Question : What is the positive difference between $120\\%$ of 30 and $130\\%$ of 20?\n\nPlease think step by step and output each reasoning step in a new line.\nYou MUST output the very final answer value in the last line without any prefix or extra text, ONLY the answer numerical value.\n\n'

qvalues_labels=[]
tot=len(pool0)
i=0
# Obtenir la date et l'heure actuelles
current_time = datetime.now()

# Afficher la date et l'heure actuelles
print("Date et heure actuelles :", current_time)

for p in clean_pool0:
    i+=1
    print(f"{i}/{tot}")
    tr=trajectory(s0,p)
    #print("trajectoire ________ ", tr)
    # for each state action in trajectory pool
    appendtr=""
    for sa in tr :
        # roll out
        sapool= roll_out(s0,p,sa)
        clean_sapool = remove_duplicates(sapool)
        #print("cleaned __________", clean_sapool)
        #compute reward
        best_reward=0
        for action in clean_sapool:
            actual_reward=reward(s0,sa,action)
            if actual_reward >=best_reward:
                best_reward=actual_reward
        #selectes best reward
        # add to qvalues_labels
        qvalues_labels.append(((s0, p+'\n'+appendtr+sa,action),best_reward))
        appendtr+=sa
# Obtenir la date et l'heure actuelles
current_time = datetime.now()
# Afficher la date et l'heure actuelles
print("Date et heure actuelles :", current_time)

# Convertir les tuples en listes pour la compatibilité JSON
data_to_save = [list(item) for item in qvalues_labels]

with open('qvalues_labels.json', 'w') as f:
    json.dump(data_to_save, f)

# Charger les données depuis un fichier JSON
with open('qvalues_labels.json', 'r') as f:
    loaded_data = json.load(f)

# Convertir les listes en tuples
data_reloaded = [tuple(item) for item in loaded_data]

qvalues_labels=[]

current_time = datetime.now()
# Afficher la date et l'heure actuelles
print("Date et heure actuelles :", current_time)

for i in tqdm(range(len(df))):
    dd=df.iloc[i]
    question = dd['problem']
    full_answer=dd['solution']
    short_answer = extract_boxed_answer(full_answer)
    s0=prompt.format(question=question)
    pool0=roll_out(s0,"","")
    clean_pool0=remove_duplicates(pool0)
    for p in clean_pool0:
        tr=trajectory(s0,p)
        #print("trajectoire ________ ", tr)
        # for each state action in trajectory pool
        appendtr=""
        for sa in tr :
            # roll out
            sapool= roll_out(s0,p,sa)
            clean_sapool = remove_duplicates(sapool)
            #print("cleaned __________", clean_sapool)
            #compute reward
            best_reward=0
            for action in clean_sapool:
                actual_reward=reward(s0,sa,action)
                if actual_reward >=best_reward:
                    best_reward=actual_reward
            #selectes best reward
            # add to qvalues_labels
            qvalues_labels.append(((s0, p+'\n'+appendtr+sa,action),best_reward))
            appendtr+=sa
# Obtenir la date et l'heure actuelles
current_time = datetime.now()
# Afficher la date et l'heure actuelles
print("Date et heure actuelles :", current_time)
data_to_save = [list(item) for item in qvalues_labels]
with open('qvalues_labels.json', 'w') as f:
    json.dump(data_to_save, f)

data_to_save = [list(item) for item in qvalues_labels]
with open('qvalues_labels.json', 'w') as f:
    json.dump(data_to_save, f)

len(data_reloaded)

3805

data_reloaded[0]

(['Question : How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?\n\nPlease think step by step and output each reasoning step in a new line.\nYou MUST output the very final answer value in the last line without any prefix or extra text, ONLY the answer numerical value.\n\n',
  '1. **Vertical asymptotes occur where the denominator of a rational function equals zero.**\n2. **Factor the denominator:**  $x^2 + x - 6 = (x+3)(x-2)$',
  ''],
 1)

EMBEDDING_ENDPOINT = "http://localhost:11434/api/embeddings"
def embeddings(prompt):
    headers = {
        "Content-Type": "application/json",
    }

    data = {
        "model" :  "nomic-embed-text",
        "prompt": prompt
    }
    response = requests.post(EMBEDDING_ENDPOINT, headers=headers, data=json.dumps(data))
    try:
        return response.json()['embedding']
    except Exception as e:
        print("Embedding Error")
    return torch.zeros(768, dtype=torch.float32)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Définition du modèle
class QNetwork(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(embedding_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, state_embedding, action_embedding):
        x = torch.cat((state_embedding, action_embedding), dim=-1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        q_value = self.fc3(x)
        return q_value

# Paramètres
embedding_dim = 768
hidden_dim = 256
batch_size = 32
num_epochs = 10
learning_rate = 0.001


# Modèle
q_network = QNetwork(embedding_dim, hidden_dim)

# Optimiseur
optimizer = optim.Adam(q_network.parameters(), lr=0.001)

# Fonction de perte
criterion = nn.MSELoss()


# Fonction d'embedding (à remplacer par votre fonction d'embeddings)
def embed_text(text):
    # Exemple de fonction d'embedding retournant un vecteur aléatoire
    return  torch.tensor(embeddings(text))

# Exemple de données
state = "example state text"
action = "example action text"
value = 0.5

# Embedding des données
state_embedding = embed_text(state)
action_embedding = embed_text(action)

# Préparation des données pour l'entraînement
state_embedding = state_embedding.unsqueeze(0)  # Ajouter dimension batch
action_embedding = action_embedding.unsqueeze(0)  # Ajouter dimension batch
value = torch.tensor([value])

# Forward pass
predicted_q_value = q_network(state_embedding, action_embedding)

# Calcul de la perte
loss = criterion(predicted_q_value, value)

# Backward pass et optimisation
optimizer.zero_grad()
loss.backward()
optimizer.step()

print("Predicted Q Value:", predicted_q_value.item())

Predicted Q Value: 0.0895596370100975

/home/imed/anaconda3/envs/ime/lib/python3.11/site-packages/torch/nn/modules/loss.py:535: UserWarning: Using a target size (torch.Size([1])) that is different to the input size (torch.Size([1, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
  return F.mse_loss(input, target, reduction=self.reduction)

class QDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        state_action, value = self.data[idx]
        state, action = state_action[0], state_action[1]
        state_embedding = embed_text(state)
        action_embedding = embed_text(action)
        return state_embedding, action_embedding, torch.tensor(value, dtype=torch.float32)

# Création du DataLoader
dataset = QDataset(data_reloaded)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

len(dataset)

3805

# Boucle d'entraînement
i=0
for epoch in range(num_epochs):
    i+=1
    print(f"{i}/{num_epochs}")
    for state_embedding, action_embedding, value in tqdm(dataloader):
        # Forward pass
        predicted_q_value = q_network(state_embedding, action_embedding).squeeze()

        # Calcul de la perte
        loss = criterion(predicted_q_value, value)

        # Backward pass et optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

print("Entraînement terminé.")

num_epochs=10
from tqdm import tqdm
import matplotlib.pyplot as plt

from tqdm import tqdm

# Boucle d'entraînement
loss_history = []
i = 0
for epoch in range(num_epochs):
    i += 1
    print(f"{i}/{num_epochs}")
    epoch_loss = 0
    with tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False) as pbar:
        for state_embedding, action_embedding, value in pbar:
            # Forward pass
            predicted_q_value = q_network(state_embedding, action_embedding).squeeze()

            # Calcul de la perte
            loss = criterion(predicted_q_value, value)

            # Backward pass et optimisation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Ajouter la perte à la perte de l'époque
            epoch_loss += loss.item()

            # Mettre à jour la barre de progression avec la perte actuelle
            pbar.set_postfix(loss=loss.item())

    # Calculer la perte moyenne pour cette époque
    epoch_loss /= len(dataloader)
    loss_history.append(epoch_loss)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

1/10

Epoch 1/10:  95%|█████████▍| 113/119 [12:01<00:38,  6.38s/it, loss=0.0581]

Embedding Error

/tmp/ipykernel_4010/3163542839.py:22: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  return  torch.tensor(embeddings(text))

Epoch 1/10, Loss: 0.1097
2/10

Epoch 2/10, Loss: 0.1114
3/10

Epoch 3/10:  15%|█▌        | 18/119 [01:54<10:43,  6.37s/it, loss=0.0635]

Embedding Error

# Afficher l'historique des pertes
plt.plot(range(1, num_epochs + 1), loss_history, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.grid(True)
plt.show()

# Afficher l'historique des pertes
plt.plot(range(1, num_epochs + 1), loss_history, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.grid(True)
plt.show()

# Sauvegarder l'état du modèle
torch.save(q_network.state_dict(), "qv01.pth")

q_network.load_state_dict(torch.load("qv01.pth"))

<All keys matched successfully>

def last_4_lines_empty(input_str):
    # Vérifier si la chaîne est vide
    if input_str == "":
        return True

    # Séparer la chaîne en lignes
    lines = input_str.split('\n')
    
    # Si la chaîne a moins de 4 lignes, retourner False
    if len(lines) < 4:
        return False
    
    # Prendre les 4 dernières lignes
    last_4_lines = lines[-6:]
    
    # Vérifier si chaque ligne est vide
    for line in last_4_lines:
        if line.strip() != '':
            return False
    return True

prompt="""Question : {question}

Please think step by step and output each reasoning step in a new line.
You MUST output the very final answer value in the last line without any or extra text, ONLY the answer numerical value prefixed by **final answer**.

"""

#start with question
#roll out
#compute qvalue with qmodel
#use best step
#loop until find soltion
def q_start(question):

    s0=prompt.format(question=question)
    #print(llm(s0))
    found=False
    state=""
    action=""
    i=0
    while not found:
        i+=1
        '''
        print(f"______{i}____________________{state}=========={action}_____")
        print(f"__________________________{i}")
        '''
        pool=roll_out(s0,state,action)
        #print(pool)
        best_score=0.
        nex_action=pool[0]
        #print("nex_action=",nex_action)
        for n_action in pool:
            #print("state=_",state)
            #print("action_",n_action)
            if state =="":
                state='\n'
            if n_action == "":
                n_action='\n'
            # score=reward(s0,state,action)
            # score=q_network(torch.tensor(embeddings(state), dtype=torch.float32),torch.tensor(embeddings(action), dtype=torch.float32))
            score=q_network(torch.tensor(embeddings(state), dtype=torch.float32),torch.tensor(embeddings(n_action), dtype=torch.float32))
            score=float(score)
            if score >= best_score:
                nex_action=n_action
        
        state=state+'\n'+action
        action=nex_action
        if action =="" or action is None:
            break
        if "final answer" in action.lower():
            break
        if last_4_lines_empty(state) :
            break
            
    #print(f"{state}\n{action}")
    return state+'\n'+action

# Obtenir la date et l'heure actuelles
current_time = datetime.now()

# Afficher la date et l'heure actuelles
print("Date et heure actuelles :", current_time)
ans=q_start(question)
print(nettoyer_chaine(ans))
# Obtenir la date et l'heure actuelles
current_time = datetime.now()

# Afficher la date et l'heure actuelles
print("Date et heure actuelles :", current_time)

Date et heure actuelles : 2024-07-14 09:17:53.111912


1. Calculate 120% of 30: (120/100) * 30 = 36


2. Calculate 130% of 20: (130/100) * 20 = 26


3. Find the positive difference between 36 and 26: 36 - 26 = 10


final answer 10
10
Date et heure actuelles : 2024-07-14 09:18:06.970778

dataset = load_dataset("lighteval/MATH", 'all', split='test[:100]')
#Fonction d'évaluation
def evaluate_rep2(example):
    prompt = example['problem']
    prediction = nettoyer_chaine(q_start(prompt))
    return {
        'reference': extract_boxed_answer(example['solution']),
        'prediction': prediction
    }
# Application de l'évaluation sur le dataset
results = []
for example in tqdm(dataset):
    results.append(evaluate_rep2(example))

with open("math_eval_02.json", 'w') as file:
    json.dump(results, file, indent=4)

  1%|          | 1/100 [00:17<28:44, 17.42s/it]

Embedding Error

/tmp/ipykernel_2906/3991589663.py:34: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  score=q_network(torch.tensor(embeddings(state), dtype=torch.float32),torch.tensor(embeddings(n_action), dtype=torch.float32))
 13%|█▎        | 13/100 [11:25<1:24:50, 58.51s/it]

Embedding Error

 17%|█▋        | 17/100 [14:34<1:02:03, 44.87s/it]

Embedding Error

 18%|█▊        | 18/100 [15:47<1:12:54, 53.35s/it]

Embedding Error

 27%|██▋       | 27/100 [22:23<56:02, 46.06s/it]

Embedding Error

# Calcul des scores
predictions = [result['prediction'] for result in results]
references = [result['reference'] for result in results]

score = evaluate.load('exact_match').compute(predictions=predictions, references=references)

# Affichage des résultats
print(f"Score exact_match sur le dataset lighteval/MATH: {score}")

Score exact_match sur le dataset lighteval/MATH: {'exact_match': 0.52}

score = evaluate.load('rouge').compute(predictions=predictions, references=references)
print(f"Score exact_match sur le dataset lighteval/MATH: {score}")

Score exact_match sur le dataset lighteval/MATH: {'rouge1': 0.6206349206349205, 'rouge2': 0.08333333333333331, 'rougeL': 0.6228888888888888, 'rougeLsum': 0.6239682539682538}

score = evaluate.load('competition_math').compute(predictions=predictions, references=references)
print(f"Score exact_match sur le dataset lighteval/MATH: {score}")

Score exact_match sur le dataset lighteval/MATH: {'accuracy': 0.56}

Q* Illustrated ¶

Minimal Implementation for Lighteval/MATH benchmark¶

Ime 07/2024 ¶

(*) illustration from code_your_own_AI (https://www.youtube.com/watch?v=LCDofAE5gYw).¶

Section 1: Introduction to Q-Learning and Reinforcement Learning ¶

Section 2: A* and Monte Carlo Tree Search (MCTS)* ¶

Section 3: Implementing Q* with PyTorch ¶

Figure from official paper (https://arxiv.org/pdf/2406.14283)¶

Section 4: Case Study - Testing on the 🤗 lighteval/MATH Dataset ¶

Q Learning ¶

1. Initialize the Q-values arbitrarily for all state-action pairs.¶

2. Observe the current state.¶

3. Select an action based on the current Q-values (typically using an epsilon-greedy strategy to balance exploration and exploitation).¶

4. Take the action, observe the reward and the new state.¶

5. Update the Q-value for the state-action pair based on the reward received and the maximum Q-value of the next state.¶

6. Repeat the process until the Q-values converge.¶

Pseudo code¶

Let's code a simple implementation to illustrate it¶

Our Environement will be a grill with a givent start cell, goal cell aand some obstacles celles¶

We must find the best path from start to goal¶

Explanations¶

Initialization: The agent initializes the Q table with zero values ​​for each state-action pair.¶

Action choice: An action is chosen according to an epsilon-greedy policy which allows both exploration (random choice) and exploitation (choice of the action with the maximum Q value).¶

Q update: After each action, the Q table is updated according to the Bellman equation, taking into account the reward obtained and the maximum Q value of the new state.¶

Training: The agent trains by repeating episodes, where each episode corresponds to a sequence of states, actions and rewards, ending when a terminal state is reached.¶

Test env¶

Grid NxP cells, start on position (0,0) and goal on (N-1,P-1).¶

Actions: "up", "down", "left", "right".¶

Reward: +100 if goral is reached, -100 if obstacle, -1 for all other cases.¶

Grid Actions initial states¶

Animation¶

Deep reinforcement learning ¶

Using neural networks to estimate the Q table¶

Allows you to manage large or continuous state and action spaces, where the traditional Q table would become too large or inefficient¶

Neural network architectures ¶

Fully Connected Networks: Simple and widely used for medium-sized state spaces.¶

Convolutional Neural Networks (CNN): Used primarily in environments where the input data is images, such as in video games.¶

Recurrent Neural Networks (RNN): Suitable for environments where past states are important for predicting the future, such as in games with time sequence elements.¶

Dueling Networks: Architecture where the network is split into two paths to separately learn the value of states and the benefit of actions, then combining to form an estimate of Q.¶

Deep Q-Network (DQN): Introduces concepts like experiment replay and target network to improve learning stability.¶

Adapt the code to use a neural network

To use a neural network to estimate the Q table, here are the key adaptations to make:¶

Replace the Q table with a neural network: The network must take the state as input and produce a Q value for each action.¶

Manage transitions: Store experiences (state, action, reward, new state) in a replay memory for network training.¶

Batch Training: Use mini batches of experiments to train the network to minimize the loss function, often calculated as the squared error between the predicted Q value and the updated Q target.¶

Here is a basic example for a DQN:¶

Display initial state¶

A* ¶

Breadth First Search ¶

Early srop¶

Dijkstra’s Algorithm (or Uniform Cost Search) ¶

Changes Made:¶

Implemented A Algorithm: Modified the BFS/Dijkstra's to A using a priority queue with a heuristic function.¶

Heuristic Function: Added a Manhattan distance heuristic function.¶

Adjusted Animation: Ensured the animation uses the A* steps and reconstructs the path correctly.¶

Here we are : Implementing Q* ¶

Dataset Pipe Line¶

Dataset¶

$\boxed{ 75}$¶

NB : 36% ¶

Let's try other metrics¶

Evaluate llm without Q*¶

rougeLsum': 0.6176895748414735}¶

Q * alignement¶

1st way ¶

2nd way ¶

to plug to :¶

to plug in¶

[26]¶

[27]¶

3rd way ¶

Reward function¶

In particular, we assign a reward of 1 if the generated code passes all test cases ( for code generation¶

We will use following way :¶

Compute Reward(s0, state,action) ¶

Rolling out ¶

roll out s0 = question¶

contruct trajecory¶

for each element reward (state,action)¶

keep best reward trajectory¶

Initialization: The agent initializes the Q table with zero values for each state-action pair.¶