Source: nash_dqn_speed.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
import operator
import random, copy
import pickle
from ..common.nn_components import cReLU, Flatten
from ..common.storage import ReplayBuffer
from ..common.rl_utils import choose_optimizer, EpsilonScheduler
from ..common.networks import NetBase, get_model
from .dqn import DQN, DQNBase
from mars.equilibrium_solver import NashEquilibriumECOSSolver

DEBUG = False

def kl(p, q):
    """Kullback-Leibler divergence D(P || Q) for discrete distributions

    Parameters
    ----------
    p, q : array-like, dtype=float, shape=n
        Discrete probability distributions.
    """
    p = np.asarray(p, dtype=float)  # builtin float (np.float is removed in recent NumPy)
    q = np.asarray(q, dtype=float)
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

class Debugger():
    def __init__(self, env, log_path=None):
        self.env = env
        if env.OneHotObs:
            self.num_states_per_step = int(self.env.observation_space.shape[0])
        else:
            self.num_states_per_step = int(self.env.observation_space.high[0]/(self.env.max_transition+1))
        self.max_transition = env.max_transition
        self.kl_dist_list = [[] for _ in range(self.max_transition)]
        self.mse_v_list = [[] for _ in range(self.max_transition)]
        self.mse_exp_list = [[] for _ in range(self.max_transition)]
        self.cnt = 0
        self.save_interval = 10
        self.logging = {'num_states_per_step': self.num_states_per_step,
                        'max_transition': self.max_transition,
                        'cnt': [],
                        'state_visit': {},
                        'kl_nash_dist': [],
                        'mse_nash_v': [],
                        'mse_exploitability': []
                        }
        self.log_path = log_path
        self.state_list = []
        self.oracle_nash_strategies = np.vstack(self.env.Nash_strategies)  # flatten to shape dim 1
        self.oracle_nash_values = np.concatenate(self.env.Nash_v)  # flatten to shape dim 1
        self.oracle_nash_q_values = np.concatenate(self.env.Nash_q)  # flatten to shape dim 1

    def compare_with_oracle(self, state, dists, ne_vs, verbose=False):
        """Compare the predicted Nash solution with the oracle solution.

        :param state: current state
        :param dists: predicted Nash strategies (distributions)
        :param ne_vs: predicted Nash equilibrium values based on the predicted Nash strategies
        :param verbose: whether to print logging information, defaults to False
        :type verbose: bool, optional
        """
        self.cnt += 1
        if self.env.OneHotObs:
            state_ = state[0].cpu().numpy()
            id_state = np.where(state_ > 0)[0][0]
        else:
            id_state = int(torch.sum(state).cpu().numpy()/2)
        for j in range(self.max_transition):  # Nash value for non-terminal states (before the final timestep)
            if id_state >= j*self.num_states_per_step and id_state < (j+1)*self.num_states_per_step:  # determine which timestep the current state belongs to
                ne_strategy = self.oracle_nash_strategies[id_state]
                ne_v = self.oracle_nash_values[id_state]
                ne_q = self.oracle_nash_q_values[id_state]
                oracle_first_player_ne_strategy = ne_strategy[0]
                nash_dqn_first_player_ne_strategy = dists[0][0]
                br_v = np.min(nash_dqn_first_player_ne_strategy@ne_q)  # best response value (value against best response), reflects exploitability of the learned Nash
                kl_dist = kl(oracle_first_player_ne_strategy, nash_dqn_first_player_ne_strategy)
                self.kl_dist_list[j].append(kl_dist)
                mse_v = float((ne_v - ne_vs)**2)  # squared error of Nash values (predicted and oracle)
                self.mse_v_list[j].append(mse_v)
                mse_exp = float((ne_v - br_v)**2)  # the target value of the best response value (exploitability) should be the Nash value
                self.mse_exp_list[j].append(mse_exp)
        self.state_visit(id_state)
        self.log([id_state, kl_dist, ne_vs], verbose)
        if self.cnt % self.save_interval == 0:
            self.dump_log()

    def state_visit(self, state):
        self.state_list.append(state)

    def log(self, data, verbose=False):
        # get state visitation statistics
        unique, counts = np.unique(self.state_list, return_counts=True)
        state_stat = dict(zip(unique, counts))
        if verbose:
            print('state index: {}, KL: {}'.format(*data))
            print('state visitation counts: {}'.format(state_stat))
        self.logging['cnt'].append(self.cnt)
        self.logging['state_visit'] = state_stat
        self.logging['kl_nash_dist'] = self.kl_dist_list
        self.logging['mse_nash_v'] = self.mse_v_list
        self.logging['mse_exploitability'] = self.mse_exp_list

    def dump_log(self,):
        with open(self.log_path, "wb") as f:
            pickle.dump(self.logging, f)

class NashDQNSpeed(DQN):
    """
    Nash-DQN algorithm
    """
    def __init__(self, env, args):
        super().__init__(env, args)
        self.num_envs = args.num_envs
        self.model = NashDQNBase(env, args.net_architecture, args.num_envs, two_side_obs=args.marl_spec['global_state']).to(self.device)
        self.target = copy.deepcopy(self.model).to(self.device)

        if args.num_process > 1:
            self.model.share_memory()
            self.target.share_memory()
        self.num_agents = env.num_agents[0] if isinstance(env.num_agents, list) else env.num_agents
        try:
            self.action_dims = env.action_space[0].n
        except:
            self.action_dims = env.action_space.n
        # don't forget to instantiate an optimizer although there is one in DQN
        self.optimizer = choose_optimizer(args.optimizer)(self.model.parameters(), lr=float(args.learning_rate))
        # lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=50, gamma=0.95)
        # self.schedulers.append(lr_scheduler)
        if DEBUG:
            self.debugger = Debugger(env, "./data/nash_dqn_simple_mdp_log.pkl")
        self.warm_up = 500*2000  # ~5000 episodes, b.c. 0.1 update freq and ~2000 episode length; warm-up steps use a non-Nash update manner

    def choose_action(self, state, Greedy=False, epsilon=None):
        if Greedy:
            epsilon = 0.
        elif epsilon is None:
            epsilon = self.epsilon_scheduler.get_epsilon()
        if not isinstance(state, torch.Tensor):
            state = torch.Tensor(state).to(self.device)
        if self.num_envs == 1:  # state: (agents, state_dim)
            state = state.unsqueeze(0).view(1, -1)  # change state from (agents, state_dim) to (1, agents*state_dim)
        else:  # state: (agents, envs, state_dim)
            state = torch.transpose(state, 0, 1)  # to state: (envs, agents, state_dim)
            state = state.view(state.shape[0], -1)  # to state: (envs, agents*state_dim)
        if random.random() > epsilon:  # NoisyNet does not use e-greedy
            with torch.no_grad():
                q_values = self.model(state).detach().cpu().numpy()  # needs state: (batch, agents*state_dim)

            if self.update_cnt < self.warm_up:
                q_tables = q_values.reshape(-1, self.action_dims, self.action_dims)
                actions = []
                for qt in q_tables:
                    row_q = np.average(qt, axis=-1)
                    col_q = np.average(qt.T, axis=-1)
                    actions.append([np.argmax(row_q), np.argmin(col_q)])
            else:
                # if self.args.cce:
                #     actions = self.compute_cce(q_values)
                # else:
                actions, dists, ne_vs = self.compute_nash(q_values)
                if DEBUG:  # test on an arbitrary MDP
                    self.debugger.compare_with_oracle(state, dists, ne_vs, verbose=True)
        else:
            actions = np.random.randint(self.action_dims, size=(state.shape[0], self.num_agents))  # (envs, agents)

        if self.num_envs == 1:
            actions = actions[0]  # list of actions to its item
        else:
            actions = np.array(actions).T  # to shape: (agents, envs, action_dim)
        return actions

    def compute_nash(self, q_values, return_dist_only=False):
        """
        Return actions as the Nash equilibrium of the given payoff matrices, shape: [env, agent]
        """
        q_tables = q_values.reshape(-1, self.action_dims, self.action_dims)
        all_actions = []
        all_dists = []
        all_ne_values = []
        for qs in q_tables:  # iterate over envs
            # Solve the Nash equilibrium with a solver
            try:
                # ne = NashEquilibriaSolver(qs)
                # ne = ne[0]  # take the first Nash equilibrium found
                # print(np.linalg.det(qs))
                # ne = NashEquilibriumSolver(qs)
                # ne = NashEquilibriumLPSolver(qs)
                # ne = NashEquilibriumCVXPYSolver(qs)
                # ne = NashEquilibriumGUROBISolver(qs)
                ne, ne_v = NashEquilibriumECOSSolver(qs)
            except:  # in some cases the NE cannot be solved
                print('No Nash solution for: ', np.linalg.det(qs), qs)
                ne = self.num_agents*[1./qs.shape[0]*np.ones(qs.shape[0])]  # use a uniform distribution if no NE is found
                ne_v = 0
            all_dists.append(ne)
            all_ne_values.append(ne_v)
            # Sample actions from the Nash strategies
            actions = []
            for dist in ne:  # iterate over agents
                try:
                    sample_hist = np.random.multinomial(1, dist)  # returns a one-hot vector as a sample from the multinomial
                except:
                    print('Not a valid distribution from the Nash equilibrium solution.')
                    print(sum(ne[0]), sum(ne[1]))
                    print(qs, ne)
                    print(dist)
                a = np.where(sample_hist > 0)
                actions.append(a)
            all_actions.append(np.array(actions).reshape(-1))
        if return_dist_only:
            return all_dists
        else:  # return sampled actions, Nash strategies, Nash values
            return np.array(all_actions), all_dists, all_ne_values

    def compute_cce(self, q_values, return_dist=False):
        """
        Return actions as a coarse correlated equilibrium of the given payoff matrices, shape: [env, agent]
        """
        q_tables = q_values.reshape(-1, self.action_dims, self.action_dims)
        all_actions = []
        all_dists = []
        for qs in q_tables:  # iterate over envs
            try:
                # note: CoarseCorrelatedEquilibriumLPSolver is not imported above
                _, _, jnt_probs = CoarseCorrelatedEquilibriumLPSolver(qs)
            except:  # in some cases the CCE cannot be solved
                print('No CCE solution for: ', np.linalg.det(qs), qs)
                jnt_probs = 1./(qs.shape[0]*qs.shape[1])*np.ones(qs.shape[0]*qs.shape[1])  # use a uniform distribution if no CCE is found

            try:
                sample_hist = np.random.multinomial(1, jnt_probs)  # a joint probability matrix for all players
            except:
                print('Not a valid distribution from the Nash equilibrium solution.')
                print(sum(jnt_probs), sum(abs(jnt_probs)))
                print(qs, jnt_probs)
            sample_hist = sample_hist.reshape(self.action_dims, self.action_dims)
            a = np.where(sample_hist > 0)  # the actions for the two players
            all_actions.append(np.array(a).reshape(-1))
            all_dists.append(jnt_probs)
        if return_dist:
            return all_dists
        else:
            return np.array(all_actions)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(self.batch_size)
        state = torch.FloatTensor(np.float32(state)).to(self.device)
        next_state = torch.FloatTensor(np.float32(next_state)).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        done = torch.FloatTensor(np.float32(done)).to(self.device)
        # Q-learning with a target network
        q_values = self.model(state)
        target_next_q_values_ = self.target(next_state)
        target_next_q_values = target_next_q_values_.detach().cpu().numpy()
        action_dim = int(np.sqrt(q_values.shape[-1]))  # for the two-symmetric-agent case only
        action_ = torch.LongTensor([a[0]*action_dim+a[1] for a in action]).to(self.device)
        q_value = q_values.gather(1, action_.unsqueeze(1)).squeeze(1)
        # compute CCE or NE
        # if args.cce:  # Coarse Correlated Equilibrium
        #     cce_dists = self.compute_cce(target_next_q_values, return_dist=True)
        #     target_next_q_values_ = target_next_q_values_.reshape(-1, action_dim, action_dim)
        #     cce_dists_ = torch.FloatTensor(cce_dists).to(self.device)
        #     next_q_value = torch.einsum('bij,bij->b', cce_dists_, target_next_q_values_)
        # else:  # Nash Equilibrium
        if self.update_cnt < self.warm_up:
            expected_q_value = reward
        else:
            nash_dists = self.compute_nash(target_next_q_values, return_dist_only=True)  # get the mixed-strategy Nash rather than specific actions
            target_next_q_values_ = target_next_q_values_.reshape(-1, action_dim, action_dim)
            nash_dists_ = torch.FloatTensor(nash_dists).to(self.device)
            next_q_value = torch.einsum('bk,bk->b', torch.einsum('bj,bjk->bk', nash_dists_[:, 0], target_next_q_values_), nash_dists_[:, 1])
            expected_q_value = reward + (self.gamma ** self.multi_step) * next_q_value * (1 - done)
        # Huber loss
        # loss = F.smooth_l1_loss(q_value, expected_q_value.detach(), reduction='none')
        loss = F.mse_loss(q_value, expected_q_value.detach())
        loss = loss.mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if self.update_cnt % self.target_update_interval == 0:
            self.update_target(self.model, self.target)
            # self.update_cnt = 0
        self.update_cnt += 1
        return loss.item()

class NashDQNBase(DQNBase):
    """
    Nash-DQN for parallel env sampling

    Parameters
    ----------
    env : environment (OpenAI Gym)
    """
    def __init__(self, env, net_args, number_envs=2, two_side_obs=True):
        super().__init__(env, net_args)
        self.number_envs = number_envs
        try:
            if two_side_obs:
                self._observation_shape = tuple(map(operator.add, env.observation_space.shape, env.observation_space.shape))  # double the shape
            else:
                self._observation_shape = env.observation_space.shape
            self._action_shape = (env.action_space.n)**2
        except:
            if two_side_obs:
                self._observation_shape = tuple(map(operator.add, env.observation_space[0].shape, env.observation_space[0].shape))  # double the shape
            else:
                self._observation_shape = env.observation_space[0].shape
            self._action_shape = (env.action_space[0].n)**2
        self._construct_net(env, net_args)

    def _construct_net(self, env, net_args):
        input_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=self._observation_shape)
        output_space = gym.spaces.Discrete(self._action_shape)
        if len(self._observation_shape) <= 1:  # not image
            self.net = get_model('mlp')(input_space, output_space, net_args, model_for='discrete_q')
        else:
            ...
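For orientation, the target in NashDQNSpeed.update() evaluates the Nash value as the bilinear form v = pi_row^T Q pi_col for each environment, using nested einsums over the batch. The snippet below is a minimal self-contained check, for illustration only and not part of the source file, that the same einsum pattern matches an explicit per-environment loop on random data.

# Minimal sketch (not from the source): the batched bilinear form used in
# NashDQNSpeed.update() to compute Nash values from the target Q table.
import torch

batch, n = 4, 3
q = torch.randn(batch, n, n)                       # per-env payoff matrices Q(s', a, b)
pi = torch.softmax(torch.randn(batch, 2, n), -1)   # [:, 0] row player, [:, 1] column player

# v_b = pi_row_b^T Q_b pi_col_b, same einsum pattern as in update()
v = torch.einsum('bk,bk->b', torch.einsum('bj,bjk->bk', pi[:, 0], q), pi[:, 1])

# explicit loop for comparison
v_loop = torch.stack([pi[b, 0] @ q[b] @ pi[b, 1] for b in range(batch)])
assert torch.allclose(v, v_loop)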
Source: blockwithholding.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import numpy as np
import argparse
from gym import spaces
import ray
from ray.tune.registry import register_env
from ray.rllib.models.preprocessors import get_preprocessor
from ray import tune
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
from ray.rllib.policy.policy import Policy
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils import try_import_tf
from ray.tune.util import flatten_dict
from ray.tune.result import (NODE_IP, TRAINING_ITERATION, TIME_TOTAL_S,
                             TIMESTEPS_TOTAL, EXPR_PARAM_FILE,
                             EXPR_PARAM_PICKLE_FILE, EXPR_PROGRESS_FILE,
                             EXPR_RESULT_FILE)
from ray.tune.logger import pretty_print
import os
import csv
import mdptoolbox
import pandas as pd
import math
import sympy as sym
import matplotlib.pyplot as plt  # needed by plot_Nash_equilibrium below

tf = try_import_tf()

CLI = argparse.ArgumentParser()
CLI.add_argument(
    "--alphas",
    nargs='*',
    type=float,
    default=[.4, .5]
)
CLI.add_argument(
    "--impact",
    type=float,
    default=.2
)
CLI.add_argument(
    "--threshold",
    type=float,
    default=.02
)
CLI.add_argument(
    '--algo',
    type=str,
    default='PPO'
)
CLI.add_argument(
    '--use_lstm',
    type=bool,
    default=False
)
CLI.add_argument(
    '--gamma',
    type=float,
    default=0.99
)
CLI.add_argument(
    '--lr',
    type=float,
    default=1e-6
)
CLI.add_argument(
    '--lmbda',
    type=float,
    default=1.0
)
CLI.add_argument(
    '--iteration',
    type=int,
    default=10
)
CLI.add_argument(
    '--episodes',
    type=int,
    default=1e6
)
CLI.add_argument(
    '--ep_length',
    type=int,
    default=1
)
CLI.add_argument(
    '--gpus',
    type=int,
    default=0
)
CLI.add_argument(
    '--NE',
    type=bool,
    default=False
)
CLI.add_argument(
    '--workers',
    type=int,
    default=5
)
CLI.add_argument(
    '--evaluate',
    type=bool,
    default=False
)
CLI.add_argument(
    '--eval_ep',
    type=int,
    default=1
)
args = CLI.parse_args()

eps = 1e-6
# setting in the miner's dilemma
ACTION_SPACE = spaces.Box(low=np.array([0.]), high=np.array([1.]), dtype=np.float32)
STATE_SPACE = spaces.Discrete(1)
NE = dict()

def get_optimal_strategy(a, b, y):
    x = sym.Symbol('x', real=True)
    R1 = (a - x) / (1. - x - y)
    R2 = (b - y) / (1. - x - y)
    r1 = ((b * R1) + x * (R1 + R2)) / (a * b + a * x + b * y)
    d1 = sym.Eq(sym.diff(r1, x), 0.)
    A = sym.solve(d1, x)

    if A:
        for i in A:
            if (i > eps and i < a - eps):
                return i, r1.subs(x, i)
    if (a * b + b * y < eps or r1.subs(x, 0.) > r1.subs(x, a) - eps):
        return 0., r1.subs(x, 0.)
    else:
        return a, r1.subs(x, a)

def plot_Nash_equilibrium(x, y, z, name):
    x, y = np.meshgrid(x, y)
    z = z.transpose()
    intensity = z.reshape(len(y), len(x))

    plt.title(name)
    plt.pcolormesh(x, y, intensity, rasterized=True)
    plt.clim(0., 1.2)
    plt.colorbar()  # need a colorbar to show the intensity scale
    # plt.show()

def compute_reward(a, b, x, y):
    if (x + y > 1 - eps):
        return {'0': 0., '1': 0.}
    if (y < eps and a < eps):
        return {'0': 1., '1': 1.}
    if (x < eps and b < eps):
        return {'0': 1., '1': 1.}
    R1 = (a - x) / (1. - x - y)
    R2 = (b - y) / (1. - x - y)
    r1 = ((b * R1) + x * (R1 + R2)) / (a * b + a * x + b * y)
    r2 = ((a * R2) + y * (R1 + R2)) / (a * b + a * x + b * y)
    return {'0': r1, '1': r2}

def get_Nash_equilibrium(alphas):
    a = alphas[0]
    b = alphas[1]
    if (a + b > 1. or (a < eps and b < eps)):
        return 0., 0., 1., 1.
    x = 0.
    y = 0.
    while (True):
        X, R1 = get_optimal_strategy(a, b, y)
        Y, R2 = get_optimal_strategy(b, a, x)

        if (abs(X - x) < eps and abs(Y - y) < eps):
            rev = compute_reward(a, b, x, y)
            return x, y, rev['0'], rev['1']

        x = X
        y = Y

class MigrationEnv(MultiAgentEnv):
    def __init__(self, env_config):
        self.action_space = ACTION_SPACE
        self.observation_space = STATE_SPACE
        self.HASHRATE = np.array(env_config['alphas'])
        self.alphas = np.array(env_config['alphas'])
        self.N = len(self.alphas)
        self.episode_length = env_config['ep_length']
        self.attr = np.full((self.N), 1.)
        self.impact = args.impact
        self.threshold = args.threshold
        self.largest_pool = np.full((self.N, 2), -1)
        self.num_moves = 0

    def compute_states(self):
        obs_state = dict()
        self.largest_pool = np.full((self.N, 2), -1)
        for i in range(len(self.alphas)):
            tmp = np.array([self.alphas[i], 0., 0., 0.])
            rest = []

            for j in range(len(self.alphas)):
                if i == j:
                    continue
                if self.alphas[j] >= tmp[1]:
                    if (self.largest_pool[i][1] > -1):
                        rest.append(tmp[2])
                    tmp[2] = tmp[1]
                    self.largest_pool[i][1] = self.largest_pool[i][0]
                    tmp[1] = self.alphas[j]
                    self.largest_pool[i][0] = j
                elif self.alphas[j] > tmp[2]:
                    if (self.largest_pool[i][1] > -1):
                        rest.append(tmp[2])
                    tmp[2] = self.alphas[j]
                    self.largest_pool[i][1] = j
                else:
                    rest.append(self.alphas[j])
            tmp[3] = np.array(rest).std()
            obs_state[str(i)] = tmp
        return obs_state

    # reset the environment to the starting state
    def reset(self):
        self.num_moves = 0
        self.alphas = np.array(self.HASHRATE)
        self.attr = np.full((self.N), 1.)
        return self.compute_states()

    def construct_action(self, action_dict):
        action = np.empty([self.N, self.N], dtype=np.float32)
        for i in range(self.N):
            action[i] = np.full((self.N), self.alphas[i] * action_dict[str(i)][2])
            action[i][i] = 0.
            if self.largest_pool[i][0] > -1:
                action[i][self.largest_pool[i][0]] = self.alphas[i] * action_dict[str(i)][0]
            if self.largest_pool[i][1] > -1:
                action[i][self.largest_pool[i][1]] = self.alphas[i] * action_dict[str(i)][1]
            if (action[i].sum() > 1 - eps):
                action[i] = action[i] / (action[i] + eps)

        return action

    def step(self, action_dict):
        self.num_moves += 1
        a = np.empty([self.N, self.N], dtype=np.float32)
        b = np.empty([self.N], dtype=np.float32)
        action = self.construct_action(action_dict)
        # print("states:{}\n{}\n{}\n".format(self.compute_states(), action_dict, action))
        infiltrate = action.sum(1)
        infiltrated = action.sum(0)
        total = action.sum()
        for i in range(self.N):
            for j in range(self.N):
                if i == j:
                    a[i][j] = self.alphas[i] + infiltrated[i]
                else:
                    a[i][j] = -action[i][j]
            b[i] = (self.alphas[i] - infiltrate[i]) / (1 - total)
        r = np.empty([self.N], dtype=np.float32)
        try:
            r = np.linalg.solve(a, b)
        except (RuntimeError, np.linalg.LinAlgError):
            r = np.full((self.N), 1.)
        R = dict()
        for i in range(self.N):
            R[str(i)] = r[i]
        done = {"__all__": self.num_moves >= self.episode_length}
        for i in range(self.N):
            self.attr[i] = max(0., min(1., self.attr[i] + self.impact * (r[i] - 1.)))
        tmp_alphas = np.array(self.alphas)
        for i in range(self.N):
            sumn = tmp_alphas[i] * max(0., 1. - self.attr[i] - self.threshold)
            self.alphas[i] -= sumn
            mean = np.array(self.attr) / self.attr.sum()
            cov = np.diag(mean) - np.dot(np.transpose([mean]), [mean])
            mig = np.random.multivariate_normal(sumn * mean, sumn * cov)
            for j in range(self.N):
                # self.alphas[i] += tmp_alphas[j] * max(0, 1 - self.attr[j] - self.threshold) * self.attr[i] / self.attr.sum()
                self.alphas[j] += mig[j]
        assert(abs(self.alphas.sum() - 1.) < eps)

        alphas = dict()
        for i in range(self.N):
            alphas[str(i)] = self.alphas[i] - tmp_alphas[i]

        info = dict()
        for i in range(self.N):
            info[str(i)] = {'policy': np.array(action[i]), 'reward': r[i], 'alphas': self.alphas[i]}
        return self.compute_states(), alphas, done, info

class BlockWithholdingEnv(MultiAgentEnv):
    def __init__(self, env_config):
        self.action_space = ACTION_SPACE
        self.observation_space = STATE_SPACE
        self.alphas = env_config['alphas']
        self.N = len(self.alphas)
        self.honest_power = 1 - sum(self.alphas)
        self.episode_length = env_config['ep_length']
        self.num_moves = 0

    # reset the environment to the starting state
    def reset(self):
        self.num_moves = 0
        return {
            '0': 0,
            '1': 0
        }

    def step(self, action_dict):
        self.num_moves += 1
        a = self.alphas[0]
        b = self.alphas[1]
        x = action_dict['0'][0]
        y = action_dict['1'][0]
        done = {"__all__": self.num_moves >= self.episode_length}
        R = compute_reward(a, b, x * a, y * b)
        info = dict()
        info['0'] = {'policy': x * a, 'reward': R['0']}
        info['1'] = {'policy': y * b, 'reward': R['1']}
        return {'0': 0, '1': 0}, R, done, info

class Constant(Policy):
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.infiltrating = config['infiltrating']

    def compute_actions(self,
                        obs_batch,
                        state_batches,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        actions = []
        for i in range(len(obs_batch)):
            actions.append([self.infiltrating])
        return actions, [], {}

    def learn_on_batch(self, samples):
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass

class NE_strategy(Policy):
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        x, y, r1, r2 = get_Nash_equilibrium(config['alphas'])
        self.infiltrating = y / config['alphas'][1]

    def compute_actions(self,
                        obs_batch,
                        state_batches,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        actions = []
        for i in range(len(obs_batch)):
            actions.append([self.infiltrating])
        return actions, [], {}

    def learn_on_batch(self, samples):
        pass

    def get_weights(self):
        pass

    def set_weights(self, weights):
        pass

def on_episode_start(info):
    episode = info["episode"]

def on_episode_step(info):
    episode = info["episode"]
    episode.user_data['0'] = episode.last_info_for('0')
    episode.user_data['1'] = episode.last_info_for('1')

def on_episode_end(info):
    episode = info["episode"]
    print(episode.user_data)

def run_RL(policies_to_train, policies):
    def select_policy(agent_id):
        return agent_id

    tune.run(
        args.algo,
        stop={"episodes_total": args.episodes},
        config={
            "num_gpus": args.gpus,
            "env": BlockWithholdingEnv,
            "entropy_coeff": 0.01,
            "entropy_coeff_schedule": args.episodes * 1000,
            "clip_param": 0.1,
            "gamma": args.gamma,
            "lambda": args.lmbda,
            "lr_schedule": [[0, 1e-5], [args.episodes, 1e-7]],
            "num_workers": args.workers,
            "num_envs_per_worker": 1,
            "sample_batch_size": 10,
            "train_batch_size": 128,
            "multiagent": {
                "policies_to_train": policies_to_train,
                "policies": policies,
                "policy_mapping_fn": select_policy,
            },
            "env_config": {
                "alphas": args.alphas,
                'ep_length': args.ep_length
            },
            "monitor": True,
            "callbacks": {
                "on_episode_start": on_episode_start,
                "on_episode_step": on_episode_step,
                "on_episode_end": on_episode_end,
            },
            "ignore_worker_failures": True,
        })

NE['a0'], NE['a1'], NE['r1'], NE['r2'] = get_Nash_equilibrium(args.alphas)
print(args.alphas, NE)
policies_to_train = [str(i) for i in range(len(args.alphas))]
policies = dict()
for i in range(len(args.alphas)):
    policies[str(i)] = (None, STATE_SPACE, ACTION_SPACE, {
        "model": {
            "use_lstm": args.use_lstm
        }
    })
...
Source: debug.py
import torch
import numpy as np
import pickle
from mars.equilibrium_solver import NashEquilibriumECOSSolver, NashEquilibriumMWUSolver, NashEquilibriumParallelMWUSolver

DEBUG = False

def kl(p, q):
    """Kullback-Leibler divergence D(P || Q) for discrete distributions

    Parameters
    ----------
    p, q : array-like, dtype=float, shape=n
        Discrete probability distributions.
    """
    p = np.asarray(p, dtype=float)  # builtin float (np.float is removed in recent NumPy)
    q = np.asarray(q, dtype=float)
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

def to_one_hot(s, range):
    one_hot_vec = np.zeros(range)
    one_hot_vec[s] = 1
    return one_hot_vec

class Debugger():
    def __init__(self, env, log_path=None):
        self.env = env
        if env.OneHotObs:
            self.num_states_per_step = int(self.env.observation_space.shape[0])
        else:
            self.num_states_per_step = int(self.env.observation_space.high[0]/(self.env.max_transition+1))
        self.max_transition = env.max_transition
        self.kl_dist_list = [[] for _ in range(self.max_transition)]
        self.mse_v_list = [[] for _ in range(self.max_transition)]
        self.mse_exp_list = [[] for _ in range(self.max_transition)]
        self.brv_list = []
        self.cnt = 0
        self.save_interval = 10
        self.logging = {'num_states_per_step': self.num_states_per_step,
                        'max_transition': self.max_transition,
                        'oracle_exploitability': np.mean(self.env.Nash_v[0], axis=0),  # the average Nash value of the initial states from the max-player's view
                        'cnt': [],
                        'state_visit': {},
                        'kl_nash_dist': [],
                        'mse_nash_v': [],
                        'mse_exploitability': []
                        }
        self.log_path = log_path
        self.state_list = []
        self.oracle_nash_strategies = np.vstack(self.env.Nash_strategies)  # flatten to shape dim 1
        self.oracle_nash_values = np.concatenate(self.env.Nash_v)  # flatten to shape dim 1
        self.oracle_nash_q_values = np.concatenate(self.env.Nash_q)  # flatten to shape dim 1
        self.trans_prob_matrices = self.env.env.trans_prob_matrices
        self.reward_matrices = self.env.env.reward_matrices
        print('oracle nash v star: ', np.mean(self.env.Nash_v[0], axis=0))  # the average Nash value of the initial states from the max-player's view

    def best_response_value(self, learned_q):
        """
        Formulas for calculating best response values:
        1. Nash strategies: (pi_a^*, pi_b^*) = min max Q(s, a, b),
           where Q(s, a, b) = r(s, a, b) + gamma * min max Q(s', a', b')
           (this is the definition of the Nash Q-value);
        2. Best response (of the max player) value: Br V(s) = min_b pi_a^*(s, a) Q(s, a, b).
        """
        Br_v = []
        Br_q = []
        Nash_strategies = []
        num_actions = learned_q.shape[-1]
        for tm, rm, qm in zip(self.trans_prob_matrices[::-1], self.reward_matrices[::-1], learned_q[::-1]):  # inverse enumerate
            if len(Br_v) > 0:
                rm = np.array(rm)+np.array(Br_v[-1])  # broadcast sum on rm's last dim; the last entry in Br_v is for the next state
            br_q_values = np.einsum("ijk,ijk->ij", tm, rm)  # transition prob * reward over the last dimension in (state, action, next_state)
            br_q_values = br_q_values.reshape(-1, num_actions, num_actions)  # action list to matrix
            Br_q.append(br_q_values)
            br_values = []
            ne_strategies = []
            for q, br_q in zip(qm, br_q_values):
                ne, _ = NashEquilibriumECOSSolver(q)
                ne_strategies.append(ne)
                br_value = np.min(ne[0]@br_q)  # best response against the "Nash" strategy of the first player
                br_values.append(br_value)  # each value is a Nash equilibrium value for one state
            Br_v.append(br_values)  # (trans, state)
            Nash_strategies.append(ne_strategies)
        Br_v = Br_v[::-1]  # (#trans, #states)
        Br_q = Br_q[::-1]
        Nash_strategies = Nash_strategies[::-1]
        avg_init_br_v = -np.mean(Br_v[0])  # average best response value of the initial states; negated to make it positive
        return avg_init_br_v

    def compare_with_oracle(self, state, dists, ne_vs, ne_q_vs, verbose=False):
        """Compare the predicted Nash solution with the oracle solution.

        :param state: current state
        :param dists: predicted Nash strategies (distributions)
        :param ne_vs: predicted Nash equilibrium values based on the predicted Nash strategies
        :param verbose: whether to print logging information, defaults to False
        :type verbose: bool, optional
        """
        self.cnt += 1
        if self.env.OneHotObs:
            state_ = state[0].cpu().numpy()
            id_state = np.where(state_ > 0)[0][0]
        else:
            id_state = int(torch.sum(state).cpu().numpy()/2)
        for j in range(self.max_transition):  # Nash value for non-terminal states (before the final timestep)
            if id_state >= j*self.num_states_per_step and id_state < (j+1)*self.num_states_per_step:  # determine which timestep the current state belongs to
                ne_strategy = self.oracle_nash_strategies[id_state]
                ne_v = self.oracle_nash_values[id_state]
                ne_q = self.oracle_nash_q_values[id_state]
                oracle_first_player_ne_strategy = ne_strategy[0]
                nash_dqn_first_player_ne_strategy = dists[0][0]
                br_v = np.min(nash_dqn_first_player_ne_strategy@ne_q)  # best response value (value against best response), reflects exploitability of the learned Nash; but this minimization is taken with the oracle Nash Q
                kl_dist = kl(oracle_first_player_ne_strategy, nash_dqn_first_player_ne_strategy)
                self.kl_dist_list[j].append(kl_dist)
                mse_v = float((ne_v - ne_vs)**2)  # squared error of Nash values (predicted and oracle)
                self.mse_v_list[j].append(mse_v)
                ### this is the exploitability/regret per state, but it is not calculated correctly: the minimization should be taken over the best-response Q value rather than a Nash Q (neither oracle nor learned)
                mse_exp = float((ne_v - br_v)**2)  # the target value of the best response value (exploitability) should be the Nash value
                self.mse_exp_list[j].append(mse_exp)
        ## this is the correct calculation of exploitability: the average best-response value of the initial states
        brv = self.best_response_value(ne_q_vs)
        self.brv_list.append(brv)
        self.state_visit(id_state)
        self.log([id_state, kl_dist, ne_vs], verbose)
        if self.cnt % self.save_interval == 0:
            self.dump_log()

    def state_visit(self, state):
        self.state_list.append(state)

    def log(self, data, verbose=False):
        # get state visitation statistics
        unique, counts = np.unique(self.state_list, return_counts=True)
        state_stat = dict(zip(unique, counts))
        if verbose:
            print('state index: {}, KL: {}'.format(*data))
            print('state visitation counts: {}'.format(state_stat))
        self.logging['cnt'].append(self.cnt)
        self.logging['state_visit'] = state_stat
        self.logging['kl_nash_dist'] = self.kl_dist_list
        self.logging['mse_nash_v'] = self.mse_v_list
        self.logging['mse_exploitability'] = self.mse_exp_list
        self.logging['brv'] = self.brv_list

    def dump_log(self,):
        with open(self.log_path, "wb") as f:
            ...
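The exploitability diagnostics in Debugger rest on the best-response value min_b (pi_row^T Q)[b]: the closer the row strategy is to a Nash equilibrium, the less a best-responding opponent can gain against it. Below is a tiny standalone illustration (not from the source) on the matching pennies payoff matrix.

# Illustrative sketch (not from the source): the best-response value used as an
# exploitability measure -- the column player best-responds to the row player's
# mixed strategy, giving br_v = min over columns of (pi_row^T Q).
import numpy as np

Q = np.array([[1.0, -1.0],
              [-1.0, 1.0]])      # matching pennies, row player maximizes

pi_nash = np.array([0.5, 0.5])   # exact Nash strategy -> value 0, unexploitable
pi_off = np.array([0.8, 0.2])    # skewed strategy -> can be exploited

print(np.min(pi_nash @ Q))       # 0.0
print(np.min(pi_off @ Q))        # -0.6: the lower best-response value reveals exploitability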