diff --git a/baselines/a2c/README.md b/baselines/a2c/README.md
index 915852b..9450054 100644
--- a/baselines/a2c/README.md
+++ b/baselines/a2c/README.md
@@ -4,3 +4,10 @@
 - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
 - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options
 - also refer to the repo-wide [README.md](../../README.md#training-models)
+
+## Files
+- `run_atari`: file used to run the algorithm.
+- `policies.py`: contains the different versions of the A2C architecture (MlpPolicy, CNNPolicy, LstmPolicy...).
+- `a2c.py`: - `Model`: class used to initialize the step_model (sampling) and train_model (training)
+            - `learn`: main entry point of the A2C algorithm. Trains a policy with a given network architecture on a given environment using the A2C algorithm.
+- `runner.py`: class used to generate a batch of experiences.
diff --git a/baselines/a2c/a2c.py b/baselines/a2c/a2c.py
index d085040..43717de 100644
--- a/baselines/a2c/a2c.py
+++ b/baselines/a2c/a2c.py
@@ -16,6 +16,18 @@ from tensorflow import losses

 class Model(object):

+    """
+    We use this class to:
+        __init__:
+        - Creates the step_model
+        - Creates the train_model
+
+        train():
+        - Performs the training step (feedforward and backpropagation of gradients)
+
+        save/load():
+        - Saves or loads the model
+    """
     def __init__(self, policy, env, nsteps,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
             alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
@@ -26,7 +38,10 @@ class Model(object):

         with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
+            # step_model is used for sampling
             step_model = policy(nenvs, 1, sess)
+
+            # train_model is used to train our network
             train_model = policy(nbatch, nsteps, sess)

         A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
@@ -34,25 +49,48 @@ class Model(object):
         R = tf.placeholder(tf.float32, [nbatch])
         LR = tf.placeholder(tf.float32, [])

+        # Calculate the loss
+        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
+
+        # Policy loss
+        # Output -log(pi)
         neglogpac = train_model.pd.neglogp(A)
+
+        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
         entropy = tf.reduce_mean(train_model.pd.entropy())

+        # 1/n * sum A(si,ai) * -logpi(ai|si)
         pg_loss = tf.reduce_mean(ADV * neglogpac)
+
+        # Value loss 1/2 * SUM [R - V(s)]^2
         vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

         loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

+        # Update the parameters using the loss
+        # 1. Get the model parameters
         params = find_trainable_variables("a2c_model")
+
+        # 2. Calculate the gradients
         grads = tf.gradients(loss, params)
         if max_grad_norm is not None:
+            # Clip the gradients (normalize)
             grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
         grads = list(zip(grads, params))
+        # zip pairs each gradient with its associated parameter
+        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
+
+        # 3. Build our trainer
         trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
+
+        # 4. Backpropagation
         _train = trainer.apply_gradients(grads)

         lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

         def train(obs, states, rewards, masks, actions, values):
+            # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
+            # rewards = R + gamma * V(s')
             advs = rewards - values
             for step in range(len(obs)):
                 cur_lr = lr.value()
@@ -148,23 +186,47 @@ def learn(

     set_global_seeds(seed)

+    # Get the number of environments
     nenvs = env.num_envs
     policy = build_policy(env, network, **network_kwargs)

+    # Instantiate the model object (that creates step_model and train_model)
     model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
         max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
     if load_path is not None:
         model.load(load_path)
+
+    # Instantiate the runner object
     runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

+    # Calculate the batch_size
     nbatch = nenvs*nsteps
+
+    # Start total timer
     tstart = time.time()
+
     for update in range(1, total_timesteps//nbatch+1):
+        # Get a mini batch of experiences
         obs, states, rewards, masks, actions, values = runner.run()
+
         policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
         nseconds = time.time()-tstart
+
+        # Calculate the fps (frames per second)
         fps = int((update*nbatch)/nseconds)
         if update % log_interval == 0 or update == 1:
+            """
+            explained_variance measures whether the value function is a good
+            predictor of the returns, or whether it is worse than predicting nothing.
+            The goal is for ev to get closer and closer to 1.
+
+            Interpretation:
+            ev=0  =>  might as well have predicted zero
+            ev=1  =>  perfect prediction
+            ev<0  =>  worse than just predicting zero
+            """
             ev = explained_variance(values, rewards)
             logger.record_tabular("nupdates", update)
             logger.record_tabular("total_timesteps", update*nbatch)
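The comments above describe the A2C objective in words. As a concrete illustration, here is a minimal NumPy sketch of the same three loss terms; it is not part of baselines, and the helper name `toy_a2c_loss` and the numbers are made up.

```python
# Illustrative NumPy sketch of the A2C loss terms described in the comments above
# (not the baselines implementation; names and numbers are made up).
import numpy as np

def toy_a2c_loss(neglogpac, advs, values, returns, entropy, ent_coef=0.01, vf_coef=0.5):
    pg_loss = np.mean(advs * neglogpac)          # 1/n * sum A(s,a) * -log pi(a|s)
    vf_loss = np.mean((returns - values) ** 2)   # squared error between returns and V(s)
    return pg_loss - ent_coef * entropy + vf_coef * vf_loss

# Example with a batch of 3 transitions:
neglogpac = np.array([1.2, 0.7, 2.3])   # -log pi(a|s)
values    = np.array([0.5, 1.0, -0.2])  # V(s) predicted by the critic
returns   = np.array([1.0, 0.8, 0.1])   # bootstrapped returns R + gamma * V(s')
advs      = returns - values            # advantage estimate used by A2C
entropy   = 1.5                         # mean policy entropy
print(toy_a2c_loss(neglogpac, advs, values, returns, entropy))
```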
diff --git a/baselines/a2c/runner.py b/baselines/a2c/runner.py
index f03e0d9..a3fd554 100644
--- a/baselines/a2c/runner.py
+++ b/baselines/a2c/runner.py
@@ -3,7 +3,15 @@ from baselines.a2c.utils import discount_with_dones
 from baselines.common.runners import AbstractEnvRunner

 class Runner(AbstractEnvRunner):
+    """
+    We use this class to generate batches of experiences
+
+    __init__:
+    - Initialize the runner
+
+    run():
+    - Make a mini batch of experiences
+    """
     def __init__(self, env, model, nsteps=5, gamma=0.99):
         super().__init__(env=env, model=model, nsteps=nsteps)
         self.gamma = gamma
@@ -11,14 +19,21 @@ class Runner(AbstractEnvRunner):
         self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype

     def run(self):
+        # We initialize the lists that will contain the mini batch of experiences
         mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
         mb_states = self.states
         for n in range(self.nsteps):
+            # Given the observations, take an action and get the value (V(s))
+            # We already have self.obs because AbstractEnvRunner runs self.obs[:] = env.reset()
             actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
+
+            # Append the experiences
             mb_obs.append(np.copy(self.obs))
             mb_actions.append(actions)
             mb_values.append(values)
             mb_dones.append(self.dones)
+
+            # Take actions in the env and observe the results
             obs, rewards, dones, _ = self.env.step(actions)
             self.states = states
             self.dones = dones
@@ -28,8 +43,8 @@ class Runner(AbstractEnvRunner):
             self.obs = obs
             mb_rewards.append(rewards)
         mb_dones.append(self.dones)
-        #batch of steps to batch of rollouts
+        # Batch of steps to batch of rollouts
         mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
         mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
         mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
@@ -40,7 +55,7 @@ class Runner(AbstractEnvRunner):

         if self.gamma > 0.0:
-            #discount/bootstrap off value fn
+            # Discount/bootstrap off value fn
             last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
             for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                 rewards = rewards.tolist()
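The "Discount/bootstrap off value fn" branch above relies on `discount_with_dones`, imported at the top of runner.py. Here is a minimal sketch of that idea; `toy_discount_with_dones` is an illustrative stand-in under the assumption that the return is reset at episode boundaries and bootstrapped from the last value estimate, not the library's implementation.

```python
# Minimal sketch of the bootstrapped n-step return the runner computes when gamma > 0.
# toy_discount_with_dones is an illustrative stand-in for baselines' discount_with_dones.
def toy_discount_with_dones(rewards, dones, gamma):
    discounted, r = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1.0 - done)  # reset the running return at episode boundaries
        discounted.append(r)
    return discounted[::-1]

# If the rollout did not end in a terminal state, the last value estimate V(s_T) is appended
# so the return is bootstrapped: R_t = r_t + gamma*r_{t+1} + ... + gamma^n * V(s_T)
rewards, dones, last_value = [1.0, 0.0, 1.0], [0, 0, 0], 0.5
returns = toy_discount_with_dones(rewards + [last_value], dones + [0], gamma=0.99)[:-1]
print(returns)
```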
diff --git a/baselines/common/policies.py b/baselines/common/policies.py
index 6071ad2..1e70b81 100644
--- a/baselines/common/policies.py
+++ b/baselines/common/policies.py
@@ -43,11 +43,17 @@ class PolicyWithValue(object):
         vf_latent = tf.layers.flatten(vf_latent)
         latent = tf.layers.flatten(latent)

+        # Based on the action space, select the probability distribution type
         self.pdtype = make_pdtype(env.action_space)

+        # This builds a fully connected layer that returns a probability distribution
+        # over actions (self.pd) and our pi logits (self.pi).
         self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

+        # Take an action (sample from the distribution)
         self.action = self.pd.sample()
+
+        # Calculate the negative log probability of that action
         self.neglogp = self.pd.neglogp(self.action)
         self.sess = sess
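For a discrete action space, `pd.sample()` and `pd.neglogp()` reduce to sampling from a softmax over the logits and taking the negative log of the chosen action's probability. A toy NumPy sketch of that special case (illustrative only, not the baselines distribution classes):

```python
# Toy NumPy sketch of what pd.sample() and pd.neglogp() mean for a discrete
# (categorical) action space; illustrative only.
import numpy as np

logits = np.array([2.0, 0.5, -1.0])             # pi logits produced by the policy head
probs = np.exp(logits - logits.max())
probs /= probs.sum()                            # softmax over actions

action = np.random.choice(len(probs), p=probs)  # analogue of pd.sample()
neglogp = -np.log(probs[action])                # analogue of pd.neglogp(action)
print(action, neglogp)
```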
diff --git a/baselines/ppo2/ppo2.py b/baselines/ppo2/ppo2.py
index 0ceee8e..ff148f2 100644
--- a/baselines/ppo2/ppo2.py
+++ b/baselines/ppo2/ppo2.py
@@ -17,50 +17,112 @@ from baselines.common.tf_util import initialize
 from baselines.common.mpi_util import sync_from_root

 class Model(object):
+    """
+    We use this object to:
+        __init__:
+        - Creates the act_model
+        - Creates the train_model
+
+        train():
+        - Performs the training step (feedforward and backpropagation of gradients)
+
+        save/load():
+        - Saves or loads the model
+    """
     def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
         sess = get_session()

         with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
+            # CREATE OUR TWO MODELS
+            # act_model is used for sampling
             act_model = policy(nbatch_act, 1, sess)
+
+            # train_model is used for training
             train_model = policy(nbatch_train, nsteps, sess)

+        # CREATE THE PLACEHOLDERS
         A = train_model.pdtype.sample_placeholder([None])
         ADV = tf.placeholder(tf.float32, [None])
         R = tf.placeholder(tf.float32, [None])
+        # Keep track of the old actor
         OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
+        # Keep track of the old critic
         OLDVPRED = tf.placeholder(tf.float32, [None])
         LR = tf.placeholder(tf.float32, [])
+        # Cliprange
         CLIPRANGE = tf.placeholder(tf.float32, [])

         neglogpac = train_model.pd.neglogp(A)

+        # Calculate the entropy
+        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
         entropy = tf.reduce_mean(train_model.pd.entropy())

+        # CALCULATE THE LOSS
+        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
+
+        # Clip the value
+        # Get the predicted value
         vpred = train_model.vf
+        # Clipped value = oldvalue + clip(value - oldvalue, min = -cliprange, max = cliprange)
         vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
+        # Unclipped value loss
         vf_losses1 = tf.square(vpred - R)
+        # Clipped value loss
         vf_losses2 = tf.square(vpredclipped - R)
+        # Value loss 0.5 * SUM [max(unclipped, clipped)]
         vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
+
+        # Remember: we want the ratio (pi current policy / pi old policy)
+        # But neglogpac returns -log(policy)
+        # So we transform it into the ratio:
+        # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old)) == new / old
+        # (since the exponential cancels the log)
         ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
+
+        # Remember also that we're doing gradient ascent, i.e. we want to MAXIMIZE the objective J,
+        # which is equivalent to minimizing Loss = -J.
+        # To make the objective negative we negate the product: -(pi new / pi old) * Advantages
         pg_losses = -ADV * ratio
+
+        # Clipped version: -Advantages * clip(ratio, 1 - cliprange, 1 + cliprange)
         pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
+
+        # Final PG loss
+        # Why maximum? Because pg_losses and pg_losses2 are negated objectives: taking the max of the
+        # negative losses is the same as taking the min of the positive objectives.
         pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
         approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
         clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
+
+        # Total loss (remember that L = -J: minimizing L is the same as maximizing J)
         loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
+
+        # UPDATE THE PARAMETERS USING LOSS
+        # 1. Get the model parameters
         params = tf.trainable_variables('ppo2_model')
+        # 2. Build our trainer
         trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
+        # 3. Calculate the gradients
         grads_and_var = trainer.compute_gradients(loss, params)
         grads, var = zip(*grads_and_var)

         if max_grad_norm is not None:
+            # Clip the gradients (normalize)
             grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
         grads_and_var = list(zip(grads, var))
-
+        # zip pairs each gradient with its associated parameter
+        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
+
+        # 4. Backpropagation
         _train = trainer.apply_gradients(grads_and_var)

         def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
+            # Here we calculate the advantage A(s,a) = R + gamma * V(s') - V(s)
+            # Returns = R + gamma * V(s')
             advs = returns - values
+
+            # Normalize the advantages
             advs = (advs - advs.mean()) / (advs.std() + 1e-8)
             td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                     CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
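The comments above walk through the clipped surrogate objective. As a concrete illustration, here is a toy NumPy version of the same policy-gradient term; `toy_ppo_pg_loss` and the numbers are illustrative only, not the baselines implementation.

```python
# Toy NumPy sketch of PPO's clipped surrogate loss as described in the comments above.
import numpy as np

def toy_ppo_pg_loss(advs, neglogpac_old, neglogpac_new, cliprange=0.2):
    ratio = np.exp(neglogpac_old - neglogpac_new)                 # pi_new / pi_old
    unclipped = -advs * ratio
    clipped = -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # max of the negated objectives == min of the positive objectives (pessimistic bound)
    return np.mean(np.maximum(unclipped, clipped))

advs = np.array([1.0, -0.5, 2.0])
neglogpac_old = np.array([1.0, 0.5, 1.2])
neglogpac_new = np.array([0.6, 0.7, 1.1])
print(toy_ppo_pg_loss(advs, neglogpac_old, neglogpac_new))
```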
@@ -90,23 +152,39 @@ class Model(object):
         sync_from_root(sess, global_variables) #pylint: disable=E1101

 class Runner(AbstractEnvRunner):
+    """
+    We use this object to make a mini batch of experiences
+
+    __init__:
+    - Initialize the runner
+
+    run():
+    - Make a mini batch
+    """
     def __init__(self, *, env, model, nsteps, gamma, lam):
         super().__init__(env=env, model=model, nsteps=nsteps)
+        # Lambda used in GAE (Generalized Advantage Estimation)
         self.lam = lam
+        # Discount rate
         self.gamma = gamma

     def run(self):
+        # Here, we init the lists that will contain the mini batch of experiences
         mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
         mb_states = self.states
         epinfos = []
+        # For each of the nsteps steps
        for _ in range(self.nsteps):
+            # Given the observations, get the action, value and neglogpacs
+            # We already have self.obs because AbstractEnvRunner runs self.obs[:] = env.reset()
            actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(self.dones)
+
+            # Take actions in the env and observe the results
+            # infos contains a ton of useful information
            self.obs[:], rewards, self.dones, infos = self.env.step(actions)
            for info in infos:
                maybeepinfo = info.get('episode')
@@ -120,19 +198,36 @@ class Runner(AbstractEnvRunner):
         mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
         mb_dones = np.asarray(mb_dones, dtype=np.bool)
         last_values = self.model.value(self.obs, S=self.states, M=self.dones)
-        #discount/bootstrap off value fn
+
+        ### GENERALIZED ADVANTAGE ESTIMATION
+        # discount/bootstrap off value fn
+        # We create mb_returns and mb_advs
+        # mb_returns will contain advantage + value
         mb_returns = np.zeros_like(mb_rewards)
         mb_advs = np.zeros_like(mb_rewards)
         lastgaelam = 0
+        # From the last step back to the first step
         for t in reversed(range(self.nsteps)):
+            # If t == the last step
             if t == self.nsteps - 1:
+                # If a state is done, nextnonterminal = 0; nextnonterminal implements this logic:
+                # if done (so nextnonterminal = 0):
+                #     delta = R - V(s) (because self.gamma * nextvalues * nextnonterminal = 0)
+                # else (not done):
+                #     delta = R + gamma * V(st+1) - V(st)
                 nextnonterminal = 1.0 - self.dones
+                # V(t+1)
                 nextvalues = last_values
             else:
                 nextnonterminal = 1.0 - mb_dones[t+1]
                 nextvalues = mb_values[t+1]
+            # Delta = R(st) + gamma * V(t+1) * nextnonterminal - V(st)
             delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
+            # Advantage = delta + gamma * lambda (λ) * nextnonterminal * lastgaelam
             mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
+        # Returns
         mb_returns = mb_advs + mb_values
         return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
             mb_states, epinfos)
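To make the GAE recursion above concrete, here is a toy single-environment NumPy version with made-up numbers; it mirrors the variable names in the code but is illustrative only.

```python
# Toy NumPy sketch of the GAE recursion described in the comments above (single environment).
import numpy as np

gamma, lam = 0.99, 0.95
rewards = np.array([1.0, 0.0, 1.0])
values  = np.array([0.5, 0.4, 0.6])   # V(s_t) predicted during the rollout
dones   = np.array([0.0, 0.0, 0.0])   # 1.0 where the state at that step was terminal
last_value, last_done = 0.7, 0.0      # V(s_T) and done flag after the rollout

advs = np.zeros_like(rewards)
lastgaelam = 0.0
for t in reversed(range(len(rewards))):
    if t == len(rewards) - 1:
        nextnonterminal, nextvalues = 1.0 - last_done, last_value
    else:
        nextnonterminal, nextvalues = 1.0 - dones[t + 1], values[t + 1]
    delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
    advs[t] = lastgaelam = delta + gamma * lam * nextnonterminal * lastgaelam

returns = advs + values               # what the runner hands to model.train as "returns"
print(advs, returns)
```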
@@ -218,37 +313,56 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0

     policy = build_policy(env, network, **network_kwargs)

+    # Get the number of environments
     nenvs = env.num_envs
+
+    # Get the observation space and action space
     ob_space = env.observation_space
     ac_space = env.action_space
+
+    # Calculate the batch_size
     nbatch = nenvs * nsteps
     nbatch_train = nbatch // nminibatches

+    # Instantiate the model object (that creates act_model and train_model)
     make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train,
                     nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                     max_grad_norm=max_grad_norm)
     model = make_model()
     if load_path is not None:
         model.load(load_path)
+    # Instantiate the runner object
     runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

     epinfobuf = deque(maxlen=100)
+
+    # Start total timer
     tfirststart = time.time()

     nupdates = total_timesteps//nbatch
     for update in range(1, nupdates+1):
         assert nbatch % nminibatches == 0
+        # Start timer
         tstart = time.time()
         frac = 1.0 - (update - 1.0) / nupdates
+        # Calculate the learning rate
         lrnow = lr(frac)
+        # Calculate the cliprange
         cliprangenow = cliprange(frac)
+        # Get the minibatch
         obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632
         epinfobuf.extend(epinfos)
+
+        # For each minibatch we calculate the loss and append it
         mblossvals = []
         if states is None: # nonrecurrent version
+            # Index of each element of batch_size
+            # Create the indices array
             inds = np.arange(nbatch)
             for _ in range(noptepochs):
+                # Randomize the indexes
                 np.random.shuffle(inds)
+                # 0 to batch_size with batch_train_size step
                 for start in range(0, nbatch, nbatch_train):
                     end = start + nbatch_train
                     mbinds = inds[start:end]
@@ -270,10 +384,21 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
                     mbstates = states[mbenvinds]
                     mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

+        # Feedforward --> get losses --> update
         lossvals = np.mean(mblossvals, axis=0)
+        # End timer
         tnow = time.time()
+        # Calculate the fps (frames per second)
         fps = int(nbatch / (tnow - tstart))
         if update % log_interval == 0 or update == 1:
+            """
+            Computes the fraction of variance that ypred explains about y.
+            Returns 1 - Var[y - ypred] / Var[y]
+
+            Interpretation:
+            ev=0  =>  might as well have predicted zero
+            ev=1  =>  perfect prediction
+            ev<0  =>  worse than just predicting zero
+            """
             ev = explained_variance(values, returns)
             logger.logkv("serial_timesteps", update*nsteps)
             logger.logkv("nupdates", update)
@@ -294,7 +419,7 @@ def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0
             print('Saving to', savepath)
             model.save(savepath)
     return model
-
+# Avoid a division error when calculating the mean (if epinfos is empty, return np.nan instead of raising an error)
 def safemean(xs):
     return np.nan if len(xs) == 0 else np.mean(xs)
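Both docstrings above describe the explained-variance diagnostic. A toy version of that metric, for illustration only (the real helper used here is `explained_variance` from baselines' common utilities):

```python
# Toy sketch of the explained_variance metric described in the docstrings above.
import numpy as np

def toy_explained_variance(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary

values  = np.array([1.0, 0.5, 2.0])   # value predictions
returns = np.array([1.1, 0.4, 1.8])   # empirical returns
print(toy_explained_variance(values, returns))   # close to 1 => good value function
```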