From e57f81beccb85ad462d7841bf308d30b87068a76 Mon Sep 17 00:00:00 2001 From: Tianhong Dai Date: Wed, 17 Oct 2018 00:22:06 +0100 Subject: [PATCH 1/6] revise the readme of ddpg (#653) --- baselines/ddpg/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baselines/ddpg/README.md b/baselines/ddpg/README.md index 6e936dd..ed6d23f 100755 --- a/baselines/ddpg/README.md +++ b/baselines/ddpg/README.md @@ -2,4 +2,4 @@ - Original paper: https://arxiv.org/abs/1509.02971 - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ -- `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. \ No newline at end of file +- `python -m baselines.run --alg=ddpg --env=HalfCheetah-v2 --num_timesteps=1e6` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. From a03dacd68d6fe451670f63d6d0cc204aeb72e5dd Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Tue, 16 Oct 2018 16:26:46 -0700 Subject: [PATCH 2/6] sync internal changes. Make ddpg work with vecenvs (#654) * sync internal changes. Make ddpg work with vecenvs * B -> nenvs for consistency with other algos, small cleanups * eval_done[d]==True -> eval_done[d] * flake8 and numpy.random.random_integers deprecation warning --- baselines/common/tests/test_identity.py | 2 +- baselines/common/tf_util.py | 2 +- baselines/common/vec_env/__init__.py | 1 + baselines/ddpg/ddpg.py | 73 ++++++++++++++++--------- baselines/ddpg/ddpg_learner.py | 13 +++-- baselines/ddpg/memory.py | 2 +- 6 files changed, 59 insertions(+), 34 deletions(-) diff --git a/baselines/common/tests/test_identity.py b/baselines/common/tests/test_identity.py index 8624df1..744ed83 100644 --- a/baselines/common/tests/test_identity.py +++ b/baselines/common/tests/test_identity.py @@ -14,7 +14,7 @@ learn_kwargs = { 'a2c' : {}, 'acktr': {}, 'deepq': {}, - 'ddpg': dict(nb_epochs=None, layer_norm=True), + 'ddpg': dict(layer_norm=True), 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) } diff --git a/baselines/common/tf_util.py b/baselines/common/tf_util.py index b293975..2699035 100644 --- a/baselines/common/tf_util.py +++ b/baselines/common/tf_util.py @@ -293,7 +293,7 @@ def display_var_info(vars): if "/Adam" in name or "beta1_power" in name or "beta2_power" in name: continue v_params = np.prod(v.shape.as_list()) count_params += v_params - if "/b:" in name or "/biases" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print + if "/b:" in name or "/bias" in name: continue # Wx+b, bias is not interesting to look at => count params, but not print logger.info(" %s%s %i params %s" % (name, " "*(55-len(name)), v_params, str(v.shape))) logger.info("Total model parameters: %0.2f million" % (count_params*1e-6)) diff --git a/baselines/common/vec_env/__init__.py b/baselines/common/vec_env/__init__.py index b3cdca7..cb60531 100644 --- a/baselines/common/vec_env/__init__.py +++ b/baselines/common/vec_env/__init__.py @@ -104,6 +104,7 @@ class VecEnv(ABC): bigimg = tile_images(imgs) if mode == 'human': self.get_viewer().imshow(bigimg) + return self.get_viewer().isopen elif mode == 'rgb_array': return bigimg else: diff --git a/baselines/ddpg/ddpg.py b/baselines/ddpg/ddpg.py index b272aea..181f923 100755 --- a/baselines/ddpg/ddpg.py +++ b/baselines/ddpg/ddpg.py @@ -78,6 +78,7 @@ def learn(network, env, max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) + agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, @@ -94,16 +95,21 @@ def learn(network, env, sess.graph.finalize() agent.reset() + obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() - done = False - episode_reward = 0. - episode_step = 0 - episodes = 0 - t = 0 + nenvs = obs.shape[0] + + episode_reward = np.zeros(nenvs, dtype = np.float32) #vector + episode_step = np.zeros(nenvs, dtype = int) # vector + episodes = 0 #scalar + t = 0 # scalar epoch = 0 + + + start_time = time.time() epoch_episode_rewards = [] @@ -114,16 +120,22 @@ def learn(network, env, for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. + if nenvs > 1: + # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each + # of the environments, so resetting here instead + agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) - assert action.shape == env.action_space.shape # Execute next action. if rank == 0 and render: env.render() - assert max_action.shape == action.shape + + # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) + # note these outputs are batched from vecenv + t += 1 if rank == 0 and render: env.render() @@ -133,21 +145,24 @@ def learn(network, env, # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) - agent.store_transition(obs, action, r, new_obs, done) + agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. + obs = new_obs - if done: - # Episode done. - epoch_episode_rewards.append(episode_reward) - episode_rewards_history.append(episode_reward) - epoch_episode_steps.append(episode_step) - episode_reward = 0. - episode_step = 0 - epoch_episodes += 1 - episodes += 1 + for d in range(len(done)): + if done[d]: + # Episode done. + epoch_episode_rewards.append(episode_reward[d]) + episode_rewards_history.append(episode_reward[d]) + epoch_episode_steps.append(episode_step[d]) + episode_reward[d] = 0. + episode_step[d] = 0 + epoch_episodes += 1 + episodes += 1 + if nenvs == 1: + agent.reset() + - agent.reset() - obs = env.reset() # Train. epoch_actor_losses = [] @@ -168,7 +183,8 @@ def learn(network, env, eval_episode_rewards = [] eval_qs = [] if eval_env is not None: - eval_episode_reward = 0. + nenvs_eval = eval_obs.shape[0] + eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) @@ -177,11 +193,11 @@ def learn(network, env, eval_episode_reward += eval_r eval_qs.append(eval_q) - if eval_done: - eval_obs = eval_env.reset() - eval_episode_rewards.append(eval_episode_reward) - eval_episode_rewards_history.append(eval_episode_reward) - eval_episode_reward = 0. + for d in range(len(eval_done)): + if eval_done[d]: + eval_episode_rewards.append(eval_episode_reward[d]) + eval_episode_rewards_history.append(eval_episode_reward[d]) + eval_episode_reward[d] = 0.0 mpi_size = MPI.COMM_WORLD.Get_size() # Log stats. @@ -216,7 +232,8 @@ def learn(network, env, return x else: raise ValueError('expected scalar, got %s'%x) - combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()])) + + combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([ np.array(x).flatten()[0] for x in combined_stats.values()])) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. @@ -225,7 +242,9 @@ def learn(network, env, for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) - logger.dump_tabular() + + if rank == 0: + logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: diff --git a/baselines/ddpg/ddpg_learner.py b/baselines/ddpg/ddpg_learner.py index cfaa290..44f231f 100755 --- a/baselines/ddpg/ddpg_learner.py +++ b/baselines/ddpg/ddpg_learner.py @@ -265,19 +265,24 @@ class DDPG(object): else: action = self.sess.run(actor_tf, feed_dict=feed_dict) q = None - action = action.flatten() + if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) + + return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale - self.memory.append(obs0, action, reward, obs1, terminal1) - if self.normalize_observations: - self.obs_rms.update(np.array([obs0])) + + B = obs0.shape[0] + for b in range(B): + self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) + if self.normalize_observations: + self.obs_rms.update(np.array([obs0[b]])) def train(self): # Get a batch. diff --git a/baselines/ddpg/memory.py b/baselines/ddpg/memory.py index 781fa71..715f4c0 100755 --- a/baselines/ddpg/memory.py +++ b/baselines/ddpg/memory.py @@ -51,7 +51,7 @@ class Memory(object): def sample(self, batch_size): # Draw such that we always have a proceeding element. - batch_idxs = np.random.random_integers(self.nb_entries - 2, size=batch_size) + batch_idxs = np.random.randint(self.nb_entries - 2, size=batch_size) obs0_batch = self.observations0.get_batch(batch_idxs) obs1_batch = self.observations1.get_batch(batch_idxs) From ef96f3835b34ae59ba6d668ffa3befeed60af6fa Mon Sep 17 00:00:00 2001 From: Matthew Rahtz Date: Tue, 16 Oct 2018 16:28:23 -0700 Subject: [PATCH 3/6] Drop S and M args so that --play works (#636) --- baselines/deepq/deepq.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/baselines/deepq/deepq.py b/baselines/deepq/deepq.py index 47fe19a..c6004b2 100644 --- a/baselines/deepq/deepq.py +++ b/baselines/deepq/deepq.py @@ -47,6 +47,9 @@ class ActWrapper(object): return self._act(*args, **kwargs) def step(self, observation, **kwargs): + # DQN doesn't use RNNs so we ignore states and masks + kwargs.pop('S', None) + kwargs.pop('M', None) return self._act([observation], **kwargs), None, None, None def save_act(self, path=None): From 3677dc1b23c344a746c41f1929d7ae88c18afe1b Mon Sep 17 00:00:00 2001 From: Matthew Rahtz Date: Thu, 18 Oct 2018 13:54:39 -0700 Subject: [PATCH 4/6] Set allow_growth=True for MuJoCo session (#643) --- baselines/run.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/baselines/run.py b/baselines/run.py index 5dee154..8ab71ac 100644 --- a/baselines/run.py +++ b/baselines/run.py @@ -121,9 +121,11 @@ def build_env(args): env = retro_wrappers.wrap_deepmind_retro(env) else: - get_session(tf.ConfigProto(allow_soft_placement=True, - intra_op_parallelism_threads=1, - inter_op_parallelism_threads=1)) + config = tf.ConfigProto(allow_soft_placement=True, + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1) + config.gpu_options.allow_growth = True + get_session(config=config) env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale) From fc7f9cec49786d50bb635228da0062972bdcf967 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Thu, 18 Oct 2018 16:07:14 -0700 Subject: [PATCH 5/6] disable gym subpackages in setup.py (#661) * disable gym subpackages in setup.py * include gym[atari] in test requirements * gym[atari] -> atari-py in test requirements --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5ec1fce..726c6a3 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,8 @@ if sys.version_info.major != 3: extras = { 'test': [ 'filelock', - 'pytest' + 'pytest', + 'atari-py' ], 'bullet': [ 'pybullet', @@ -27,7 +28,7 @@ setup(name='baselines', packages=[package for package in find_packages() if package.startswith('baselines')], install_requires=[ - 'gym[mujoco,atari,classic_control,robotics]', + 'gym', 'scipy', 'tqdm', 'joblib', From d0cc325e1414b56bcc6cbd90bbc1778d49a3e950 Mon Sep 17 00:00:00 2001 From: pzhokhov Date: Fri, 19 Oct 2018 08:54:21 -0700 Subject: [PATCH 6/6] store session at policy creation time (#655) * sync internal changes. Make ddpg work with vecenvs * B -> nenvs for consistency with other algos, small cleanups * eval_done[d]==True -> eval_done[d] * flake8 and numpy.random.random_integers deprecation warning * store session at policy creation time * coexistence tests * fix a typo * autopep8 * ... and flake8 * updated todo links in test_serialization --- baselines/common/policies.py | 4 +-- baselines/common/tests/envs/mnist_env.py | 2 +- baselines/common/tests/test_serialization.py | 38 +++++++++++++++++++- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/baselines/common/policies.py b/baselines/common/policies.py index eeac242..9c9bb8b 100644 --- a/baselines/common/policies.py +++ b/baselines/common/policies.py @@ -53,7 +53,7 @@ class PolicyWithValue(object): # Calculate the neg log of our probability self.neglogp = self.pd.neglogp(self.action) - self.sess = sess + self.sess = sess or tf.get_default_session() if estimate_q: assert isinstance(env.action_space, gym.spaces.Discrete) @@ -64,7 +64,7 @@ class PolicyWithValue(object): self.vf = self.vf[:,0] def _evaluate(self, variables, observation, **extra_feed): - sess = self.sess or tf.get_default_session() + sess = self.sess feed_dict = {self.X: adjust_shape(self.X, observation)} for inpt_name, data in extra_feed.items(): if inpt_name in self.__dict__.keys(): diff --git a/baselines/common/tests/envs/mnist_env.py b/baselines/common/tests/envs/mnist_env.py index 4f73495..473008d 100644 --- a/baselines/common/tests/envs/mnist_env.py +++ b/baselines/common/tests/envs/mnist_env.py @@ -1,7 +1,6 @@ import os.path as osp import numpy as np import tempfile -import filelock from gym import Env from gym.spaces import Discrete, Box @@ -14,6 +13,7 @@ class MnistEnv(Env): episode_len=None, no_images=None ): + import filelock from tensorflow.examples.tutorials.mnist import input_data # we could use temporary directory for this with a context manager and # TemporaryDirecotry, but then each test that uses mnist would re-download the data diff --git a/baselines/common/tests/test_serialization.py b/baselines/common/tests/test_serialization.py index 4086f2b..f46b578 100644 --- a/baselines/common/tests/test_serialization.py +++ b/baselines/common/tests/test_serialization.py @@ -1,4 +1,5 @@ import os +import gym import tempfile import pytest import tensorflow as tf @@ -39,7 +40,7 @@ def test_serialization(learn_fn, network_fn): if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: # TODO make acktr work with recurrent policies # and test - # github issue: https://github.com/openai/baselines/issues/194 + # github issue: https://github.com/openai/baselines/issues/660 return env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) @@ -75,6 +76,41 @@ def test_serialization(learn_fn, network_fn): np.testing.assert_allclose(std1, std2, atol=0.5) +@pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) +@pytest.mark.parametrize("network_fn", ['mlp']) +def test_coexistence(learn_fn, network_fn): + ''' + Test if more than one model can exist at a time + ''' + + if learn_fn == 'deepq': + # TODO enable multiple DQN models to be useable at the same time + # github issue https://github.com/openai/baselines/issues/656 + return + + if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: + # TODO make acktr work with recurrent policies + # and test + # github issue: https://github.com/openai/baselines/issues/660 + return + + env = DummyVecEnv([lambda: gym.make('CartPole-v0')]) + learn = get_learn_function(learn_fn) + + kwargs = {} + kwargs.update(network_kwargs[network_fn]) + kwargs.update(learn_kwargs[learn_fn]) + + learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs) + make_session(make_default=True, graph=tf.Graph()); + model1 = learn(seed=1) + make_session(make_default=True, graph=tf.Graph()); + model2 = learn(seed=2) + + model1.step(env.observation_space.sample()) + model2.step(env.observation_space.sample()) + + def _serialize_variables(): sess = get_session()