import random
from collections import deque
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

from Utils.Noise import GaussianNoise
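

# `Utils.Noise.GaussianNoise` is a project-local helper whose source is not shown in
# this file. The sketch below is an assumption inferred from the call site
# `GaussianNoise(action_dim=..., sigma=...).sample()`: zero-mean Gaussian noise with
# one independent sample per action dimension. It is kept under a hypothetical name
# so it never shadows the real import above and exists purely as a reference.
class _GaussianNoiseSketch:
    def __init__(self, action_dim, sigma=0.1, mu=0.0):
        self.action_dim = action_dim
        self.sigma = sigma
        self.mu = mu

    def sample(self):
        # One N(mu, sigma^2) sample per action dimension.
        return np.random.normal(self.mu, self.sigma, self.action_dim)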


class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action, hidden_dim=256):
        super(Actor, self).__init__()
        # Sigmoid keeps the raw output in (0, 1); forward() rescales it to (0, max_action).
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Sigmoid(),
        )
        self.max_action = max_action

    def forward(self, state):
        return self.net(state) * self.max_action


class Critic(nn.Module):
    def __init__(self, critic_input_dim, hidden_dim=256):
        super(Critic, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(critic_input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, obs, action):
        # Centralised critic: `obs` and `action` are the concatenated observations
        # and actions of all agents.
        return self.net(torch.cat((obs, action), dim=1))


class MADDPGAgent():
    def __init__(self, obs_dim, action_dim, critic_input_dim, max_action,
                 device='cuda' if torch.cuda.is_available()
                        else 'mps' if torch.backends.mps.is_available()
                        else 'cpu',
                 actor_lr=3e-4, critic_lr=1e-4, hidden_dim=256, tau=0.001):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.critic_input_dim = critic_input_dim
        self.hidden_dim = hidden_dim
        self.max_action = max_action
        self.device = device
        self.tau = tau

        self.actor = Actor(self.obs_dim, self.action_dim, self.max_action, self.hidden_dim).to(device)
        self.critic = Critic(self.critic_input_dim, self.hidden_dim).to(device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

        # Target networks start as exact copies of the online networks.
        self.target_actor = Actor(self.obs_dim, self.action_dim, self.max_action, self.hidden_dim).to(device)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic = Critic(self.critic_input_dim, self.hidden_dim).to(device)
        self.target_critic.load_state_dict(self.critic.state_dict())

    def act(self, obs, noise_std=0.0):
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action = self.actor(obs_tensor).cpu().numpy()[0]

        # Exploration noise (sigma = noise_std) added to the greedy action.
        noise = GaussianNoise(action_dim=self.action_dim, sigma=noise_std)
        action = action + noise.sample()

        return np.clip(action, 0, self.max_action).astype(np.float32)

    def update_target(self):
        # Polyak (soft) update: theta_target <- tau * theta + (1 - tau) * theta_target.
        for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


class ReplayBuffer():
    def __init__(self, max_size=10000):
        self.memory = deque(maxlen=max_size)

    def store(self, state, action, reward, next_state, done):
        # Each argument is a list with one entry per agent.
        self.memory.append((state, action, reward, next_state, done))

    def sample(self, batch_size=512):
        if len(self.memory) < batch_size:
            batch = random.sample(self.memory, len(self.memory))
        else:
            batch = random.sample(self.memory, batch_size)

        obs_n, act_n, rew_n, next_obs_n, done_n = zip(*batch)

        def transpose_stack(x):
            # Turn a batch of per-agent lists into a per-agent list of stacked
            # arrays, i.e. element i has shape (batch_size, dim_i) for agent i.
            return [np.stack(agent_data) for agent_data in zip(*x)]

        return (
            transpose_stack(obs_n),
            transpose_stack(act_n),
            transpose_stack(rew_n),
            transpose_stack(next_obs_n),
            transpose_stack(done_n),
        )

    def __len__(self):
        return len(self.memory)


class MADDPG():
    def __init__(self, env, n_agent,
                 device='cuda' if torch.cuda.is_available()
                        else 'mps' if torch.backends.mps.is_available()
                        else 'cpu',
                 actor_lr=3e-4, critic_lr=1e-4, hidden_dim=256, batch_size=512,
                 gamma=0.99, tau=0.001, replay_buffer_size=50000):
        self.agents = {}
        self.n_agent = n_agent
        self.device = device
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.replay_buffer = ReplayBuffer(max_size=replay_buffer_size)

        # PettingZoo-style parallel envs only populate `env.agents` after reset(),
        # so reset before reading the agent list.
        env.reset()
        self.agent_names = env.agents

        # The centralised critic sees every agent's observation and action.
        global_critic_dim = 0
        for agent in self.agent_names:
            obs_dim = env.observation_space(agent).shape[0]
            act_dim = env.action_space(agent).shape[0]
            global_critic_dim += obs_dim + act_dim

        for agent_id in self.agent_names:
            obs_dim = env.observation_space(agent_id).shape[0]
            act_dim = env.action_space(agent_id).shape[0]
            max_action = env.action_space(agent_id).high[0]

            # Each agent gets the shared device and hyperparameters.
            self.agents[agent_id] = MADDPGAgent(
                obs_dim,
                act_dim,
                global_critic_dim,
                max_action,
                device=self.device,
                actor_lr=self.actor_lr,
                critic_lr=self.critic_lr,
                hidden_dim=self.hidden_dim,
                tau=self.tau,
            )

    def store_transition(self, obs, action, reward, next_obs, done):
        # Convert the per-agent dicts into ordered lists before storing.
        obs_n, act_n, reward_n, next_obs_n, done_n = [], [], [], [], []
        for agent_id in self.agent_names:
            obs_n.append(obs[agent_id])
            act_n.append(action[agent_id])
            reward_n.append(reward[agent_id])
            next_obs_n.append(next_obs[agent_id])
            done_n.append(done[agent_id])
        self.replay_buffer.store(obs_n, act_n, reward_n, next_obs_n, done_n)

    def step(self, obs, noise_std=0.1):
        actions = {}
        for agent_id in self.agent_names:
            actions[agent_id] = self.agents[agent_id].act(obs[agent_id], noise_std)
        return actions

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        obs_n, act_n, reward_n, next_obs_n, done_n = self.replay_buffer.sample(batch_size=self.batch_size)
        obs_n = [torch.tensor(o, dtype=torch.float32).to(self.device) for o in obs_n]
        act_n = [torch.tensor(a, dtype=torch.float32).to(self.device) for a in act_n]
        reward_n = [torch.tensor(r, dtype=torch.float32).unsqueeze(1).to(self.device) for r in reward_n]
        next_obs_n = [torch.tensor(no, dtype=torch.float32).to(self.device) for no in next_obs_n]
        done_n = [torch.tensor(d, dtype=torch.float32).unsqueeze(1).to(self.device) for d in done_n]

        # Target actions for every agent, used to build the centralised TD target.
        next_act_n = []
        with torch.no_grad():
            for i, agent_name in enumerate(self.agent_names):
                agent = self.agents[agent_name]
                target_act = agent.target_actor(next_obs_n[i])
                next_act_n.append(target_act)
            target_critic_obs = torch.cat(next_obs_n, dim=1)
            target_critic_act = torch.cat(next_act_n, dim=1)

        current_critic_obs = torch.cat(obs_n, dim=1)
        current_critic_act = torch.cat(act_n, dim=1)

        for i, agent_name in enumerate(self.agent_names):
            agent = self.agents[agent_name]

            # Critic target: r_i + gamma * (1 - done_i) * Q'_i(x', a'_1, ..., a'_N).
            with torch.no_grad():
                target_q_next = agent.target_critic(target_critic_obs, target_critic_act)
                target_q = reward_n[i] + (1 - done_n[i]) * self.gamma * target_q_next

            current_q = agent.critic(current_critic_obs, current_critic_act)
            critic_loss = F.mse_loss(current_q, target_q)

            # Actor loss: replace only this agent's action with its current policy
            # output, keeping the other agents' sampled actions fixed (detached).
            curr_pol_out = agent.actor(obs_n[i])
            actor_input_act_n = [a.detach() for a in act_n]
            actor_input_act_n[i] = curr_pol_out
            critic_input_act_update = torch.cat(actor_input_act_n, dim=1)

            actor_loss = -agent.critic(current_critic_obs, critic_input_act_update).mean()

            agent.actor_optimizer.zero_grad()
            actor_loss.backward()
            agent.actor_optimizer.step()

            # zero_grad() here also clears the critic gradients accumulated by the
            # actor's backward pass, so the critic step uses only critic_loss.
            agent.critic_optimizer.zero_grad()
            critic_loss.backward()
            agent.critic_optimizer.step()

            agent.update_target()
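

# A minimal usage sketch, not part of the original module: it assumes a PettingZoo
# MPE parallel environment with continuous actions. The environment name
# `simple_spread_v3`, the episode count, and the noise level are illustrative, and
# the (obs, info) reset / five-tuple step signature assumes a recent PettingZoo release.
if __name__ == "__main__":
    from pettingzoo.mpe import simple_spread_v3

    env = simple_spread_v3.parallel_env(continuous_actions=True)
    maddpg = MADDPG(env, n_agent=len(env.possible_agents))

    for episode in range(10):
        obs, _ = env.reset()
        while env.agents:
            actions = maddpg.step(obs, noise_std=0.1)
            next_obs, rewards, terminations, truncations, _ = env.step(actions)
            dones = {a: terminations[a] or truncations[a] for a in terminations}

            maddpg.store_transition(obs, actions, rewards, next_obs, dones)
            maddpg.train()
            obs = next_obs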