Fix scoping of variables (#670)

Co-authored-by: Martin van Hensbergen <martin@mvhensbergen.com>
2025-07-31 22:04:31 +00:00 · 2023-08-13 18:03:05 +02:00
parent f26cbe13e9
commit 7012d22902
1 changed file with 5 additions and 3 deletions
--- a/docs/tutorials/training_agents/blackjack_tutorial.py
+++ b/docs/tutorials/training_agents/blackjack_tutorial.py
@@ -176,6 +176,7 @@ observation, reward, terminated, truncated, info = env.step(action)
 class BlackjackAgent:
    def __init__(
        self,
+        env,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
@@ -203,7 +204,7 @@ class BlackjackAgent:

        self.training_error = []

-    def get_action(self, obs: tuple[int, int, bool]) -> int:
+    def get_action(self, env, obs: tuple[int, int, bool]) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.
@@ -236,7 +237,7 @@ class BlackjackAgent:
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
-        self.epsilon = max(self.final_epsilon, self.epsilon - epsilon_decay)
+        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)


 # %%
@@ -258,6 +259,7 @@ epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over
 final_epsilon = 0.1

 agent = BlackjackAgent(
+    env=env,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
@@ -280,7 +282,7 @@ for episode in tqdm(range(n_episodes)):

    # play one episode
    while not done:
-        action = agent.get_action(obs)
+        action = agent.get_action(env, obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent