How to use store_results_dir method in autotest

Best Python code snippet using autotest_python

Trainer.py

Source:Trainer.py Github

copy

Full Screen

# ###
# Mathematical expression recognition tool.
# Written as a part of master's thesis at VUT FIT Brno, 2022
# Author: Vladislav Halva
# Login: xhalva04
# ###
import logging
import os.path
from datetime import datetime
import numpy as np
import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.loader import DataLoader
from tqdm import tqdm
from src.data.ImageTransformation import ImageTransformation
from src.data.GMathDataset import CrohmeDataset
from src.data.LatexVocab import LatexVocab
from src.definitions.SltEdgeTypes import SltEdgeTypes
from src.definitions.exceptions.ModelParamsError import ModelParamsError
from src.model.Model import Model
from src.utils.loss import calculate_loss
from src.utils.utils import create_attn_gt, split_databatch, compute_single_item_stats, create_latex_result_file


class Trainer:
    """
    Wrapper for training and evaluating the model.
    """
    def __init__(self, config):
        """
        Configures model, tokenizer and summary writer.
        :param config: configuration setup - dictionary generated with Config class
        """
        self.config = config
        # define metaparameters
        self.components_shape = (32, 32)
        self.edge_features = 10
        self.edge_h_size = config['model']['encoder_edge_fsize']
        self.enc_in_size = config['model']['encoder_in_node_fsize']
        self.enc_h_size = config['model']['encoder_hidden_node_fsize']
        self.enc_out_size = config['model']['encoder_out_node_fsize']
        self.dec_in_size = config['model']['decoder_in_fsize']
        self.dec_h_size = config['model']['decoder_hidden_fsize']
        self.emb_size = config['model']['decoder_embed_fsize']
        self.dec_att_size = config['model']['decoder_attn_size']
        self.enc_vgg_dropout_p = config['model']['dropout_encoder_vgg']
        self.enc_gat_dropout_p = config['model']['dropout_encoder_gat']
        self.dec_emb_dropout_p = config['model']['dropout_decoder_init_embed']
        self.dec_att_dropout_p = config['model']['dropout_decoder_attention']
        self.substitute_terms = False
        # use GPU if available
        if config['device'] == 'gpu':
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device("cpu")
        logging.info(f"Device: {self.device}")
        # load or create tokenizer
        if config['vocabulary']['load_tokenizer'] and os.path.exists(config['vocabulary']['tokenizer_filepath']):
            # load tokenizer from file
            self.tokenizer = LatexVocab.load_tokenizer(config['vocabulary']['tokenizer_filepath'])
            logging.info(f"Tokenizer loaded from: {config['vocabulary']['tokenizer_filepath']}")
        elif config['vocabulary']['inkml_folder_for_vocab'] is not None and \
                config['vocabulary']['vocab_filepath'] is not None and \
                os.path.exists(config['vocabulary']['vocab_filepath']) and \
                os.path.exists(config['vocabulary']['inkml_folder_for_vocab']):
            # create tokenizer from InkML ground truth
            LatexVocab.generate_formulas_file_from_inkmls(
                config['vocabulary']['inkml_folder_for_vocab'],
                config['vocabulary']['vocab_filepath'],
                substitute_terms=self.substitute_terms,
                latex_gt=False, mathml_gt=True)
            self.tokenizer = LatexVocab.create_tokenizer(config['vocabulary']['vocab_filepath'], min_freq=2)
            LatexVocab.save_tokenizer(self.tokenizer, config['vocabulary']['tokenizer_filepath'])
            logging.info(f"Tokenizer created as: {config['vocabulary']['tokenizer_filepath']}")
        else:
            raise ModelParamsError('Vocabulary could not be initialized')
        self.vocab_size = self.tokenizer.get_vocab_size()
        self.end_node_token_id = self.tokenizer.encode("[EOS]", add_special_tokens=False).ids[0]
        logging.info(f"Vocab size: {self.vocab_size}")
        # init model; model name carries a timestamp so each run is distinguishable
        now = datetime.now()
        self.model_name = config['model']['model_name'] + '_' + now.strftime("%y-%m-%d_%H-%M-%S")
        self.model = Model(
            self.device, self.edge_features, self.edge_h_size,
            self.enc_in_size, self.enc_h_size, self.enc_out_size, self.dec_in_size, self.dec_h_size, self.emb_size,
            self.dec_att_size,
            self.vocab_size, self.end_node_token_id, self.tokenizer,
            self.enc_vgg_dropout_p, self.enc_gat_dropout_p, self.dec_emb_dropout_p, self.dec_att_dropout_p)
        self.model.double()
        # load model state dict
        if config['model']['load'] and os.path.exists(config['model']['load_state_dict']):
            self.model.load_state_dict(torch.load(config['model']['load_state_dict'], map_location=self.device))
            logging.info(f"Model loaded: {config['model']['load_state_dict']}")
        self.model.to(self.device)
        # init summary writer
        if config['writer_path'] is not None:
            if os.path.exists(config['writer_path']):
                self.writer = SummaryWriter(os.path.join(config['writer_path'], self.model_name))
            else:
                logging.info("Writer could not be initialized, directory does not exist")
                self.writer = None
        else:
            # NOTE(review): disabled writer is None in one branch and False in the
            # other; both are falsy so `if self.writer:` works, but the
            # inconsistency is worth confirming/unifying.
            self.writer = False
        # set temporary data storage
        if config['tmp_data_storage_folder'] is not None and os.path.exists(config['tmp_data_storage_folder']):
            self.temp_path = config['tmp_data_storage_folder']
        else:
            self.temp_path = None
        # evaluation-during-training settings are off until set explicitly
        self.eval_during_training = False
        self.eval_train_settings = None
        self.second_eval_during_training = False
        self.second_eval_train_settings = None

    def unset_eval_during_training(self):
        """
        Disables first evaluation settings during training.
        """
        self.eval_during_training = False
        self.eval_train_settings = None

    def unset_second_eval_during_training(self):
        """
        Disables second evaluation settings during training.
        """
        self.second_eval_during_training = False
        self.second_eval_train_settings = None

    def set_eval_during_training(self, images_root, inkmls_root, batch_size, print_stats,
                                 print_item_level_stats, each_nth_epoch=1, beam_search=True, beam_width=3):
        """
        Sets first evaluation of model during training.
        :param images_root: images folder
        :param inkmls_root: inkml files folder
        :param batch_size: batch size (has to be 1 if beam search is used)
        :param print_stats: print statistics to STDOUT
        :param print_item_level_stats: print item level statistics to STDOUT
        :param each_nth_epoch: run evaluation each n-th epoch
        :param beam_search: if True Beam search will be used for output graph generation, Greedy search otherwise
        :param beam_width: beam width of Beam search
        """
        # silently ignored if the data folders do not exist or the period is invalid
        if os.path.exists(images_root) and os.path.exists(inkmls_root) and each_nth_epoch > 0:
            self.eval_during_training = True
            self.eval_train_settings = {
                'images_root': images_root,
                'inkmls_root': inkmls_root,
                'batch_size': batch_size,
                'print_stats': print_stats,
                'print_item_level_stats': print_item_level_stats,
                'each_nth_epoch': each_nth_epoch,
                'beam_search': beam_search,
                'beam_width': beam_width
            }

    def set_second_eval_during_training(self, images_root, inkmls_root, batch_size, print_stats,
                                        print_item_level_stats, each_nth_epoch=1, beam_search=True, beam_width=3):
        """
        Sets second evaluation of model during training.
        :param images_root: images folder
        :param inkmls_root: inkml files folder
        :param batch_size: batch size (has to be 1 if beam search is used)
        :param print_stats: print statistics to STDOUT
        :param print_item_level_stats: print item level statistics to STDOUT
        :param each_nth_epoch: run evaluation each n-th epoch
        :param beam_search: if True Beam search will be used for output graph generation, Greedy search otherwise
        :param beam_width: beam width of Beam search
        """
        # silently ignored if the data folders do not exist or the period is invalid
        if os.path.exists(images_root) and os.path.exists(inkmls_root) and each_nth_epoch > 0:
            self.second_eval_during_training = True
            self.second_eval_train_settings = {
                'images_root': images_root,
                'inkmls_root': inkmls_root,
                'batch_size': batch_size,
                'print_stats': print_stats,
                'print_item_level_stats': print_item_level_stats,
                'each_nth_epoch': each_nth_epoch,
                'beam_search': beam_search,
                'beam_width': beam_width
            }

    def train(self, images_root, inkmls_root, epochs, batch_size=1, loss_config=None, save_model_dir=None, save_checkpoint_each_nth_epoch=0):
        """
        Trains model.
        :param images_root: images folder
        :param inkmls_root: inkml files folder
        :param epochs: number of train epochs
        :param batch_size: batch size
        :param loss_config: loss configuration dictionary - coefficients of single loss components
        :param save_model_dir: folder where model state dicts will be saved
        :param save_checkpoint_each_nth_epoch: how often save state dict checkpoint (0 disables checkpoints)
        """
        logging.info("\nTraining...")
        optimizer = optim.Adam(self.model.parameters(), lr=0.0003)
        # augment images while training
        transform = ImageTransformation()
        trainset = CrohmeDataset(images_root, inkmls_root, self.tokenizer, self.components_shape, self.temp_path, self.substitute_terms, transform=transform)
        trainloader = DataLoader(trainset, batch_size, True, follow_batch=['x', 'tgt_y', 'gt', 'gt_ml', 'filename'])
        self.model.train()
        for epoch in range(epochs):
            epoch_loss = 0
            # accumulated counts over the whole epoch; ratios derived after the batch loop
            epoch_stats = {
                'symbol_count': 0,
                'correct_symbol_count': 0,
                'edge_count': 0,
                'correct_edge_count': 0,
                'src_node_count': 0,
                'correct_src_node_count': 0,
                'attn_relevant_items': 0,
                'attn_block1_abs_diff': 0,
                'attn_block2_abs_diff': 0,
                'attn_block3_abs_diff': 0
            }
            logging.info(f"\nEPOCH: {epoch}")
            for i, data_batch in enumerate(tqdm(trainloader)):
                data_batch = create_attn_gt(data_batch, self.end_node_token_id)
                data_batch = data_batch.to(self.device)
                optimizer.zero_grad()
                out = self.model(data_batch)
                loss = calculate_loss(out, self.end_node_token_id, self.device, self.writer, loss_config=loss_config, writer_idx=epoch * len(trainloader) + i)
                loss.backward()
                # gradient clipping
                torch.nn.utils.clip_grad_value_(self.model.parameters(), clip_value=5.0)
                optimizer.step()
                # training evaluation on the detached model output
                batch_stats = self.evaluate_training(out.detach())
                epoch_stats['symbol_count'] += batch_stats['symbol_count']
                epoch_stats['correct_symbol_count'] += batch_stats['correct_symbol_count']
                epoch_stats['edge_count'] += batch_stats['edge_count']
                epoch_stats['correct_edge_count'] += batch_stats['correct_edge_count']
                epoch_stats['src_node_count'] += batch_stats['src_node_count']
                epoch_stats['correct_src_node_count'] += batch_stats['correct_src_node_count']
                epoch_stats['attn_relevant_items'] += batch_stats['attn_relevant_items']
                epoch_stats['attn_block1_abs_diff'] += batch_stats['attn_block1_abs_diff']
                epoch_stats['attn_block2_abs_diff'] += batch_stats['attn_block2_abs_diff']
                epoch_stats['attn_block3_abs_diff'] += batch_stats['attn_block3_abs_diff']
                if self.writer:
                    self.writer.add_scalar('ItemAttnMeanAbsDiff_blk1/train', batch_stats['attn_block1_mean_abs_diff'], epoch * len(trainloader) + i)
                    self.writer.add_scalar('ItemAttnMeanAbsDiff_blk2/train', batch_stats['attn_block2_mean_abs_diff'], epoch * len(trainloader) + i)
                    self.writer.add_scalar('ItemAttnMeanAbsDiff_blk3/train', batch_stats['attn_block3_mean_abs_diff'], epoch * len(trainloader) + i)
                epoch_loss += loss.item()
                if self.writer:
                    self.writer.add_scalar('ItemLoss/train', loss.item(), epoch * len(trainloader) + i)
            logging.info(f" epoch loss total: {epoch_loss}")
            logging.info(f" epoch loss avg: {epoch_loss/len(trainloader)/batch_size}")
            if self.writer:
                self.writer.add_scalar('EpochLossTotal/train', epoch_loss, epoch)
                self.writer.add_scalar('EpochLossAvg/train', epoch_loss/len(trainloader)/batch_size, epoch)
            if save_checkpoint_each_nth_epoch != 0 and epoch % save_checkpoint_each_nth_epoch == save_checkpoint_each_nth_epoch - 1:
                # save model checkpoint
                # NOTE(review): unlike the final save below, this path does not
                # check save_model_dir for None/existence — confirm callers always
                # pass a valid directory when checkpoints are enabled.
                save_model_check_name = self.model_name + '_' + str(epoch) + '.pth'
                torch.save(self.model.state_dict(), os.path.join(save_model_dir, save_model_check_name))
            # derive epoch-level accuracies, guarding against division by zero
            epoch_stats['symbol_acc'] = epoch_stats['correct_symbol_count'] / epoch_stats['symbol_count'] if epoch_stats['symbol_count'] > 0 else 0
            epoch_stats['edge_acc'] = epoch_stats['correct_edge_count'] / epoch_stats['edge_count'] if epoch_stats['edge_count'] > 0 else 0
            epoch_stats['src_symbol_acc'] = epoch_stats['correct_src_node_count'] / epoch_stats['src_node_count'] if epoch_stats['src_node_count'] > 0 else 0
            epoch_stats['attn_block1_mean_abs_diff'] = epoch_stats['attn_block1_abs_diff'] / epoch_stats['attn_relevant_items'] if epoch_stats['attn_relevant_items'] > 0 else 0
            epoch_stats['attn_block2_mean_abs_diff'] = epoch_stats['attn_block2_abs_diff'] / epoch_stats['attn_relevant_items'] if epoch_stats['attn_relevant_items'] > 0 else 0
            epoch_stats['attn_block3_mean_abs_diff'] = epoch_stats['attn_block3_abs_diff'] / epoch_stats['attn_relevant_items'] if epoch_stats['attn_relevant_items'] > 0 else 0
            if self.writer:
                self.writer.add_scalar('SetSymAcc/train', epoch_stats['symbol_acc'], epoch)
                self.writer.add_scalar('SetEdgeAcc/train', epoch_stats['edge_acc'], epoch)
                self.writer.add_scalar('SetSrcSymAcc/train', epoch_stats['src_symbol_acc'], epoch)
                self.writer.add_scalar('SetAttnMeanAbsDiff_blk1/train', epoch_stats['attn_block1_mean_abs_diff'], epoch)
                self.writer.add_scalar('SetAttnMeanAbsDiff_blk2/train', epoch_stats['attn_block2_mean_abs_diff'], epoch)
                self.writer.add_scalar('SetAttnMeanAbsDiff_blk3/train', epoch_stats['attn_block3_mean_abs_diff'], epoch)
            logging.info(f" symbol class acc: {epoch_stats['symbol_acc'] * 100:.3f}%")
            logging.info(f" edge class acc: {epoch_stats['edge_acc'] * 100:.3f}%")
            if self.eval_during_training and epoch % self.eval_train_settings['each_nth_epoch'] == self.eval_train_settings['each_nth_epoch'] - 1:
                # model first evaluation
                self.evaluate(
                    self.eval_train_settings['images_root'],
                    self.eval_train_settings['inkmls_root'],
                    self.eval_train_settings['batch_size'],
                    self.writer, epoch,
                    self.eval_train_settings['print_stats'],
                    self.eval_train_settings['print_item_level_stats'],
                    beam_search=self.eval_train_settings['beam_search'],
                    beam_width=self.eval_train_settings['beam_width']
                )
            if self.second_eval_during_training and epoch % self.second_eval_train_settings['each_nth_epoch'] == self.second_eval_train_settings['each_nth_epoch'] - 1:
                # model second evaluation
                self.evaluate(
                    self.second_eval_train_settings['images_root'],
                    self.second_eval_train_settings['inkmls_root'],
                    self.second_eval_train_settings['batch_size'],
                    self.writer, epoch,
                    self.second_eval_train_settings['print_stats'],
                    self.second_eval_train_settings['print_item_level_stats'],
                    beam_search=self.second_eval_train_settings['beam_search'],
                    beam_width=self.second_eval_train_settings['beam_width'],
                    eval_id=2
                )
        if save_model_dir is not None and os.path.exists(save_model_dir):
            # save model final state dict
            save_model_final_name = self.model_name + '_final.pth'
            torch.save(self.model.state_dict(), os.path.join(save_model_dir, save_model_final_name))
            logging.info(f"Model saved as: {os.path.join(save_model_dir, save_model_final_name)}")

    def evaluate_training(self, data):
        """
        Model training-time batch evaluation.
        :param data: model output
        :return: stats dictionary
        """
        stats = {}
        # evaluate nodes predictions = symbols
        y_pred = torch.argmax(data.y_score, dim=1)
        target_symbols = data.tgt_y
        predicted_symbols = y_pred
        stats['symbol_count'] = target_symbols.shape[0]
        stats['correct_symbol_count'] = torch.sum((target_symbols == predicted_symbols))
        # evaluate edges predictions; only parent-child edges carry a relation
        tgt_edge_pc_indices = ((data.tgt_edge_type == SltEdgeTypes.PARENT_CHILD).nonzero(as_tuple=True)[0])
        tgt_pc_edge_relation = data.tgt_edge_relation[tgt_edge_pc_indices]
        out_pc_edge_relation = data.y_edge_rel_score[tgt_edge_pc_indices]
        out_pc_edge_relation = out_pc_edge_relation.argmax(dim=-1)
        stats['edge_count'] = tgt_pc_edge_relation.shape[0]
        stats['correct_edge_count'] = torch.sum((tgt_pc_edge_relation == out_pc_edge_relation))
        # evaluate src symbol prediction
        x_pred = torch.argmax(data.x_score, dim=1)
        x_gt_node = data.attn_gt.argmax(dim=0)
        x_gt = data.tgt_y[x_gt_node]
        stats['src_node_count'] = x_gt.shape[0]
        stats['correct_src_node_count'] = torch.sum(x_gt == x_pred)
        # evaluate attention accuracy
        # mask pairs belonging to different batch items...
        alpha_batch_mask = (data.y_batch.unsqueeze(1) - data.x_batch.unsqueeze(0) != 0).long()
        # ...and rows that correspond to the end-node token
        no_end_node_indices = (data.tgt_y != self.end_node_token_id)
        no_end_node_mask = no_end_node_indices.unsqueeze(1).repeat(1, data.x.shape[0])
        relevant_attn_mask = torch.logical_and(alpha_batch_mask, no_end_node_mask)
        relevant_items_count = torch.sum(relevant_attn_mask.long())
        attn_gt = data.attn_gt
        block1_attn = data.gcn1_alpha * relevant_attn_mask
        block2_attn = data.gcn2_alpha * relevant_attn_mask
        block3_attn = data.gcn3_alpha * relevant_attn_mask
        block1_abs_diff = torch.abs(attn_gt - block1_attn).sum()
        block2_abs_diff = torch.abs(attn_gt - block2_attn).sum()
        block3_abs_diff = torch.abs(attn_gt - block3_attn).sum()
        block1_mean_abs_diff = block1_abs_diff / relevant_items_count
        block2_mean_abs_diff = block2_abs_diff / relevant_items_count
        block3_mean_abs_diff = block3_abs_diff / relevant_items_count
        stats['attn_relevant_items'] = relevant_items_count
        stats['attn_block1_abs_diff'] = block1_abs_diff
        stats['attn_block2_abs_diff'] = block2_abs_diff
        stats['attn_block3_abs_diff'] = block3_abs_diff
        stats['attn_block1_mean_abs_diff'] = block1_mean_abs_diff
        stats['attn_block2_mean_abs_diff'] = block2_mean_abs_diff
        stats['attn_block3_mean_abs_diff'] = block3_mean_abs_diff
        return stats

    def evaluate(self, images_root, inkmls_root, batch_size=1, writer=False, epoch=None, print_stats=True,
                 print_item_level_stats=False, store_results_dir=None, results_author='', beam_search=True, beam_width=3, eval_id=""):
        """
        Evaluation of model.
        :param images_root: images folder
        :param inkmls_root: inkml files folder
        :param batch_size: batch size (has to be 1 if beam search)
        :param writer: summary writer object
        :param epoch: training epoch if during training
        :param print_stats: print statistics to STDOUT
        :param print_item_level_stats: print item level statistics to STDOUT
        :param store_results_dir: folder where recognition results shall be stored (LaTeX strings)
        :param results_author: results author signature
        :param beam_search: if True Beam search used to generate output graph, Greedy search otherwise
        :param beam_width: beam width of beam search
        :param eval_id: distinction of evaluation configuration for summary writer
        :return:
        """
        logging.info("\nEvaluation...")
        if beam_search:
            logging.info(f"Beam search with beam width: {beam_width}")
        else:
            logging.info(f"Greedy search")
        self.model.eval()
        # storing of results is skipped if the target directory does not exist
        if store_results_dir is None or not os.path.exists(store_results_dir):
            store_results_dir = None
        # load data
        testset = CrohmeDataset(images_root, inkmls_root, self.tokenizer, self.components_shape, self.temp_path, self.substitute_terms)
        testloader = DataLoader(testset, batch_size, False, follow_batch=['x', 'tgt_y', 'gt', 'gt_ml', 'filename'])
        # init statistics
        # NOTE(review): 'edit_distance_seq_avg' initialized here, but the value
        # computed below is stored under 'edit_distances_seq_avg' — confirm which
        # key downstream consumers read.
        stats = {
            'exact_match': 0,
            'exact_match_1': 0,
            'exact_match_2': 0,
            'exact_match_3': 0,
            'exact_match_pct': 0,
            'exact_match_1_pct': 0,
            'exact_match_2_pct': 0,
            'exact_match_3_pct': 0,
            'structure_match': 0,
            'structure_match_pct': 0,
            'edit_distances_seq': [],
            'edit_distance_seq_avg': 0
        }
        with torch.no_grad():
            for i, data_batch in enumerate(tqdm(testloader)):
                data_batch = create_attn_gt(data_batch, self.end_node_token_id)
                data_batch = data_batch.to(self.device)
                out = self.model(data_batch, beam_search, beam_width)
                if self.device == torch.device('cuda'):
                    out = out.cpu()
                # split result batch to separate data elements
                out_elems = split_databatch(out)
                for out_elem in out_elems:
                    # evaluate each batch item separately
                    item_stats = compute_single_item_stats(out_elem, self.tokenizer)
                    stats['edit_distances_seq'].append(item_stats['edit_distance_seq'])
                    stats['structure_match'] += 1 if item_stats['slt_diff']['structure_match'] else 0
                    stats['exact_match'] += 1 if item_stats['slt_diff']['exact_match'] else 0
                    stats['exact_match_1'] += 1 if item_stats['slt_diff']['exact_match_1'] else 0
                    stats['exact_match_2'] += 1 if item_stats['slt_diff']['exact_match_2'] else 0
                    stats['exact_match_3'] += 1 if item_stats['slt_diff']['exact_match_3'] else 0
                    if print_item_level_stats:
                        seq_symbol_accuracy = 0
                        if item_stats['seq_symbols_count'] > 0:
                            seq_symbol_accuracy = item_stats['seq_correct_symbols_count'] / item_stats['seq_symbols_count']
                        logging.info(f"\n gt symbols: {item_stats['gt_node_symbols']}")
                        logging.info(f" pr symbols: {item_stats['pred_node_symbols']}")
                        logging.info(f" seq-sym-acc: {seq_symbol_accuracy * 100:.5f}%")
                        logging.info(f" gt latex: {item_stats['latex_gt']}")
                        logging.info(f" pr latex: {item_stats['latex_pred']}")
                        logging.info(f" e-distance: {item_stats['edit_distance_seq']}")
                        logging.info(f" SLT struct-match: {item_stats['slt_diff']['structure_match']}")
                        logging.info(f" SLT exact-match: {item_stats['slt_diff']['exact_match']}")
                        logging.info(f" SLT exact-match-1: {item_stats['slt_diff']['exact_match_1']}")
                        logging.info(f" SLT exact-match-2: {item_stats['slt_diff']['exact_match_2']}")
                        logging.info(f" SLT exact-match-3: {item_stats['slt_diff']['exact_match_3']}")
                        logging.info(f" SLT sym-cls-err: {item_stats['slt_diff']['node_class_errors']}")
                        logging.info(f" SLT edge-cls-err: {item_stats['slt_diff']['edge_class_errors']}")
                    if store_results_dir is not None:
                        create_latex_result_file(store_results_dir, out_elem.filename, item_stats['latex_pred'], results_author)
                    if writer and epoch is not None:
                        # NOTE(review): step index `epoch + len(testloader) + i` looks
                        # inconsistent with the `epoch * len(loader) + i` scheme used
                        # during training — confirm whether `+` should be `*`.
                        self.writer.add_scalar('ItemEditDistSeq/eval', item_stats['edit_distance_seq'], epoch + len(testloader) + i)
        # derive set-level ratios, guarding against an empty test set
        stats['exact_match_pct'] = stats['exact_match'] / len(testset) if len(testset) > 0 else 0
        stats['exact_match_1_pct'] = stats['exact_match_1'] / len(testset) if len(testset) > 0 else 0
        stats['exact_match_2_pct'] = stats['exact_match_2'] / len(testset) if len(testset) > 0 else 0
        stats['exact_match_3_pct'] = stats['exact_match_3'] / len(testset) if len(testset) > 0 else 0
        stats['structure_match_pct'] = stats['structure_match'] / len(testset) if len(testset) > 0 else 0
        # NOTE(review): mean() of an empty array yields NaN with a warning when the
        # test set is empty — the guards above suggest that case is possible.
        stats['edit_distances_seq_avg'] = np.asarray(stats['edit_distances_seq']).mean()
        if print_stats:
            logging.info(f" exact-match: {stats['exact_match_pct']*100:.3f}% = {stats['exact_match']}")
            logging.info(f" exact-match -1: {stats['exact_match_1_pct']*100:.3f}% = {stats['exact_match_1']}")
            logging.info(f" exact-match -2: {stats['exact_match_2_pct']*100:.3f}% = {stats['exact_match_2']}")
            logging.info(f" exact-match -3: {stats['exact_match_3_pct']*100:.3f}% = {stats['exact_match_3']}")
            logging.info(f" struct-match: {stats['structure_match_pct']*100:.3f}% = {stats['structure_match']}")
            logging.info(f" e-dist avg: {stats['edit_distances_seq_avg']:.3f}")
        if writer and epoch is not None:
            self.writer.add_scalar('SetExactMatch/eval' + str(eval_id), stats['exact_match_pct'], epoch)
            self.writer.add_scalar('SetExactMatch-1/eval' + str(eval_id), stats['exact_match_1_pct'], epoch)
            self.writer.add_scalar('SetExactMatch-2/eval' + str(eval_id), stats['exact_match_2_pct'], epoch)
            self.writer.add_scalar('SetExactMatch-3/eval' + str(eval_id), stats['exact_match_3_pct'], epoch)
            self.writer.add_scalar('SetStructMatch/eval' + str(eval_id), stats['structure_match_pct'], epoch)
            self.writer.add_scalar('SetEditDistSeqAvg/eval' + str(eval_id), stats['edit_distances_seq_avg'], epoch)
        # restore training mode after evaluation
        self.model.train()

Full Screen

Full Screen

Louvain.py

Source:Louvain.py Github

copy

Full Screen

1# -*- coding: utf-8 -*-2"""3Created on Tue Aug 25 15:37:44 20204Louvain community detection method5Scoring method to see how well the correlations plus Louvain re-create the original populations6@author: owen7"""8import pointwise_correlation as pc9import community as community_louvain10import matplotlib.pyplot as plt11import matplotlib.cm as cm12import networkx as nx13import pandas as pd14import numpy as np15import collections16import scipy.stats as stats17pd.set_option("display.precision", 3)18pd.set_option("display.max_rows", 25)19pd.set_option("display.expand_frame_repr", False)20class dodgy_indices_analysis():21 22 def __init__(self,dodgy_indices=[]):23 self.verbose = True24 self.index_df=pd.read_csv("{0}".format(pc.INDEX_FILE),index_col=0)25 if not len(dodgy_indices):26 self.dodgy_indices = [527, 545, 707, 1136, 1308, 1649, 1701, 1803, 1884, 1994, 2493, 2535]27 self.columns = ['index','pop_id','expected_pop_size','actual_pop_size',28 'expected_comparisons','actual_comparisons',29 'event_beta', 'length of time series']30 self.results = pd.DataFrame(columns = self.columns)31 self.next_index = 032 self.iterate_over_dodgy_indices()33 34 35 def analyse_inconsistent_metaparams(self,v1_df,v2_df,params,meta_params):36 # measure number of comparisons within each population from raw data37 # determine which population is missing comparisons38 # look for unusual parameter values for this population39 40 v1_matching = v1_df.loc[v1_df['name1'] == v1_df['name2']]41 42 #print(v1_matching.head(10))43 # iterate through each population in the metaparams index44 for pop in meta_params.index[:-2]:45 n = meta_params.loc[pop,'n']46 expected_comparisons = int(n*(n-1)/2)47 # select only this population from the matching dfs48 # make sure type of population name is going to work in dataframe49 try:50 pop_name = int(pop)51 except ValueError:52 pop_name = pop53 54 pop_comparisons = len(v1_matching.loc[v1_matching['name1'] == pop_name].index)55 if pop_comparisons:56 actual_pop_size = 
257 while actual_pop_size*(actual_pop_size-1)/2 < pop_comparisons:58 actual_pop_size+=159 else:60 actual_pop_size = 161 62 beta = params.loc[pop,'prior process beta']63 length = meta_params.loc[pop,'T']64 65 self.results.loc[self.next_index,self.columns] = [self.current_index,pop,n,actual_pop_size,66 expected_comparisons,pop_comparisons,67 beta,length] 68 self.next_index += 169 if not pop_comparisons == expected_comparisons:70 if self.verbose:71 print("Expected {0} comparisons from population size {1}".format(expected_comparisons,n))72 print("Found {0} comparisons for population id {1}".format(pop_comparisons,pop))73 print(meta_params.loc[pop, 'T'])74 print(params.loc[pop, 'prior process beta'])75 print(params.loc[pop+"_noise" , 'betas'])76 77 78 def check_consistency_of_population_sizes(self,raw_data_dir):79 # load in all relevant data in the directory80 meta_params = pd.read_csv("{0}/meta_params.csv".format(raw_data_dir), index_col = 0)81 params = pd.read_csv("{0}/population_parameters.csv".format(raw_data_dir), index_col = 0)82 v1_df = pd.read_csv("{0}/{1}".format(raw_data_dir,'sigma_v1_correlations.csv'),index_col = 0)83 v2_df = pd.read_csv("{0}/{1}".format(raw_data_dir,'sigma_v2_correlations.csv'),index_col = 0)84 85 #print(pd.DataFrame(meta_params))86 pop_sizes = meta_params['n'].dropna()87 #print(meta_params['T'])88 #print(pop_sizes)89 k=sum(pop_sizes)90 #k = sum([pop_sizes.loc[i] for i in pop_sizes.index])91 expected_comparisons = int((k)*(k-1)/2)92 assert len(v1_df.index) == len(v2_df.index)93 self.analyse_inconsistent_metaparams(v1_df,v2_df,params,meta_params)94 95 if not len(v1_df.index) == expected_comparisons:96 return False97 98 return True 99 100 def iterate_over_dodgy_indices(self):101 for self.current_index in self.dodgy_indices:102 data_dir = self.index_df.loc[self.current_index,'raw data directory']103 if self.verbose and not self.check_consistency_of_population_sizes(data_dir):104 print("Inconsistency in index {0} - details 
above".format(self.current_index))105class Louvain_methods():106 107 def __init__(self,df_results, version = 'Unknown', p_values_graph_setup_option = 'weights', resolution = 1, recursion_level = 0,108 show_dfs = False, store_results_dir= "", Louvain_version = '1', verbose=False):109 """110 Parameters111 ----------112 df_results : TYPE DataFrame113 DESCRIPTION. df_results must have the following columns:114 'object1' giving the id of the first object115 'object2' giving the id of the second object116 'name1' giving the name of the population of object1117 'name2' giving the name of the population of object2118 'p_value' giving the probabilty of correlation between the two objects119 p_values_graph_setup_option : TYPE string120 DESCRIPTION options are 'random' to initalise complete graph with random edge weights;121 'weights' to initialise complete graph with correlation p-values as edge weights122 'edges' to selecte each edge (unweighted) with probability given by correlation p-value123 resolution : TYPE float, optional124 DESCRIPTION. The default is 1. Passed directly to community_Louvain methods125 show_dfs : TYPE boolean, optional126 DESCRIPTION. The default is False.127 store_results_dir : TYPE string, optional128 DESCRIPTION. The default is "". If passed, f-scores and confusion matrices are stored129 version : TYPE string, optional130 DESCRIPTION. The default is None. Options are 'v1' for sigma version 1 or 'v2' for sigma version 2131 Louvain_version : TYPE string, optional132 DESCRIPTION. 
The default is '1', referring to standard Louvain method maximising modularity133 If this is set to '2', the level of the dendrogram giving the nearest number of clusters to the number of populations is selected134 If this is set to '3', the standard Louvain method is run; then on each partition found, Louvain is run again135 If this is set to '4', Louvain is run recursively on each partition found until there is no change in the total number of partitions136 137 verbose : TYPE, optional138 DESCRIPTION. The default is False.139 Returns140 -------141 None.142 """143 144 self.graph_setup = p_values_graph_setup_option145 self.verbose = verbose146 self.Louvain_version = Louvain_version147 self.sigma_version = version148 self.show_dfs = show_dfs149 self.store_results_dir = store_results_dir150 self.resolution = resolution151 self.recursion_level = recursion_level152 153 # randomise order of results154 self.df_input_correlations = df_results.sample(frac=1)155 # create lists of the population names and the object ids (nodes)156 self.names=list(set(self.df_input_correlations['name1']).union(set(self.df_input_correlations['name2'])))157 self.nodes=list(set(self.df_input_correlations['object1']).union(set(self.df_input_correlations['object2'])))158 if verbose:159 print("There are {0} populations and a total of {1} individuals".format(len(self.names), len(self.nodes)))160 161 # create dictionary to look up the population name for any given object id (node)162 #name_of_node={self.df_input_correlations.loc[i,'object1']:self.df_input_correlations.loc[i,'name1'] for i in self.df_input_correlations.index}163 self.name_of_node = {}164 pop_sizes = collections.defaultdict(int)165 for i in self.df_input_correlations.index:166 id1 = self.df_input_correlations.loc[i,'object1']167 name1 = self.df_input_correlations.loc[i,'name1']168 id2 = self.df_input_correlations.loc[i,'object2']169 name2 = self.df_input_correlations.loc[i,'name2'] 170 171 # test consistency of populations and names172 
if id1 in self.name_of_node.keys():173 assert(self.name_of_node[id1] == name1) 174 else:175 pop_sizes[name1] += 1176 self.name_of_node[id1] = name1177 178 if id2 in self.name_of_node.keys():179 assert(self.name_of_node[id2] == name2) 180 else:181 pop_sizes[name2] += 1182 self.name_of_node[id2] = name2183 184 185 if self.verbose:186 print("Population sizes based on names in dataframe are {0}".format(pd.DataFrame.from_dict(pop_sizes, orient = 'index').sort_index()))187 188 self.initialise_graph()189 self.make_partition()190 191 192 def initialise_graph(self):193 #set up graph with time series as nodes and edge weights given by correlations194 self.graph=nx.Graph()195 self.graph.add_nodes_from(self.nodes)196 if self.graph_setup == 'random':197 if self.verbose:198 print("Testing random graph")199 self.graph.add_edges_from([(self.df_input_correlations.loc[i]['object1'],self.df_input_correlations.loc[i]['object2'],200 {'weight': np.random.random()}) 201 for i in self.df_input_correlations.index])202 203 elif self.graph_setup == 'weights':204 self.graph.add_edges_from([(self.df_input_correlations.loc[i]['object1'],self.df_input_correlations.loc[i]['object2'],205 {'weight': self.df_input_correlations.loc[i]['p-value']}) 206 for i in self.df_input_correlations.index])207 if self.verbose:208 print("Passing p-values as weights for each edge")209 elif self.graph_setup == 'edges':210 self.graph.add_edges_from([(self.df_input_correlations.loc[i]['object1'],self.df_input_correlations.loc[i]['object2'])211 for i in self.df_input_correlations.index if np.random.random()<self.df_input_correlations.loc[i]['p-value']])212 if self.verbose:213 print("Assigning each edge with probability given by corresponding correlation p-value")214 def make_partition(self):215 if self.Louvain_version == '1':216 # maximise modularity using greedy algorithm217 self.partition = community_louvain.best_partition(self.graph,weight='weight',resolution = self.resolution)218 elif self.Louvain_version == 
'2':219 # choose the partition with the best number of clusters220 dendro = community_louvain.generate_dendrogram(self.graph)221 # default to the lowest level (largest number of clusters)222 level = 0223 self.partition = community_louvain.partition_at_level(dendro,level)224 225 # if a better partition (closer to the correct number of populations) can be found then use it instead226 for level in range(len(dendro)-1):227 current_partition = community_louvain.partition_at_level(dendro,level)228 #print("Partition at level {0} is {1}".format(level,current_partition))229 clusters = len(set(current_partition.values()))230 if clusters<len(self.names):231 break232 233 if level:234 last_partition = community_louvain.partition_at_level(dendro,level-1)235 last_clusters = len(set(last_partition.values()))236 if abs(len(self.names)-clusters)<abs(last_clusters - len(self.names)):237 self.partition = current_partition238 else:239 self.partition = last_partition240 241 elif self.Louvain_version == '3':242 # find best partition243 self.partition = community_louvain.best_partition(self.graph,weight='weight',resolution = self.resolution)244 node_groupings = self.split_partition_into_nodes()245 recursively_analyse_subgraphs(self, to_depth = 1)246 247 elif self.Louvain_version == '4':248 # find best partition249 self.partition = community_louvain.best_partition(self.graph,weight='weight',resolution = self.resolution)250 self.node_groupings = self.split_partition_into_nodes()251 self.recursively_analyse_subgraphs(self)252 253 def split_partition_into_nodes(self):254 if self.verbose:255 print("The set of partitions is {0}".format(set(self.partition.values())))256 # create a list of each set of nodes that corresponds to a separate partitions257 node_groupings = [[node for node in self.partition.keys() if self.partition[node] == subgraph_number]258 for subgraph_number in set(self.partition.values())]259 if self.verbose:260 print("The partition is 
{0}".format(self.format_partition(self.partition)))261 print("The node-groupings are therefore {0}".format(self.format_partition(node_groupings)))262 assert(len(self.nodes) == sum([len(nodes) for nodes in node_groupings]))263 264 return node_groupings265 266 def recursively_analyse_subgraphs(self, to_depth = None):267 if self.verbose:268 print("Starting recursion level {0}".format(self.recursion_level))269 # if the graph has only one partition, or the desired recursion level has been reached,break out of the recursion270 if to_depth == self.recursion_level or len(self.node_groupings) == 1:271 if self.verbose:272 print("Returning from recursion level {0}".format(self.recursion_level))273 return None 274 275 # iterate through each group of nodes and instantiate a new Louvain_methods object for each276 new_partition = {}277 partition_number = 0278 for i,nodes in enumerate(self.node_groupings):279 df = self.df_input_correlations.loc[(self.df_input_correlations['object1'].isin(nodes))280 & (self.df_input_correlations['object2'].isin(nodes))]281 282 sub_Louvain = Louvain_methods(df,p_values_graph_setup_option = self.graph_setup,resolution = self.resolution, 283 recursion_level = self.recursion_level + 1, version = self.sigma_version, 284 Louvain_version = self.Louvain_version, verbose = self.verbose)285 if self.verbose:286 print("Sub-partition {1} found : {0}".format(self.format_partition(sub_Louvain.partition), i))287 288 # update new partition289 for individual in sub_Louvain.partition.keys():290 new_partition[individual] = sub_Louvain.partition[individual] + partition_number291 292 293 partition_number += len(set(sub_Louvain.partition.values()))294 295 if self.verbose:296 print("New partition now looks like: {0}".format(self.format_partition(new_partition)))297 298 self.partition = new_partition299 300 301 def format_partition(self, partition):302 if type(partition) == dict:303 return {str(self.name_of_node[id_code]) + "_" + str(id_code)[-4:] : partition[id_code] for 
id_code in partition.keys()}304 if type(partition) == list:305 return [[str(self.name_of_node[id_code]) + "_" + str(id_code)[-4:] for id_code in node_group] for node_group in partition]306 307 308 309 310 311 #def make_sub_graph(self, nodes = []):312 # if not len(nodes):313 # nodes = self.nodes314 315 316 def score_partition(self): 317 318 clusters=len(set(self.partition.values()))319 if self.verbose:320 print("Divided into {0} clusters".format(clusters))321 # analyse which cluster contains which members of each population322 cross_reference_dict=collections.defaultdict(lambda: collections.defaultdict(int))323 for node_id in self.partition.keys():324 cross_reference_dict["cluster {0}".format(self.partition[node_id])][self.name_of_node[node_id]]+=1325 326 cross_ref_df = pd.DataFrame.from_dict(cross_reference_dict,orient = 'index')327 if self.show_dfs:328 print(cross_ref_df)329 if len(self.store_results_dir):330 cross_ref_df.to_csv("{0}/{1}confusion_matrix.csv".format(self.store_results_dir,self.sigma_version))331 332 333 334 335 # make f-score based on true positive edges between ids in the same population336 TP=collections.defaultdict(int)337 tp=0338 FP=collections.defaultdict(int)339 fp=0340 FN=collections.defaultdict(int)341 fn=0342 for index in self.df_input_correlations.index:343 id1=self.df_input_correlations.loc[index]['object1']344 pop1=self.df_input_correlations.loc[index]['name1']345 id2=self.df_input_correlations.loc[index]['object2']346 pop2=self.df_input_correlations.loc[index]['name2']347 if pop1==pop2:348 if self.partition[id1]==self.partition[id2]:349 TP[pop1]+=1 # true positive350 tp+=1351 else:352 FN[pop1]+=1 # false negative353 fn+=1354 elif self.partition[id1]==self.partition[id2]:355 FP[pop1]+=1 # false positive - not interested in true negatives356 FP[pop2]+=1 # NB false positive edges double counted, once for each node (population object)357 fp+=1358 359 FP_overall=sum([FP[name] for name in self.names])/2 # halved as double counted within 
populations360 TP_overall=sum([TP[name] for name in self.names])361 FN_overall=sum([FN[name] for name in self.names])362 assert FP_overall==fp363 assert TP_overall==tp364 assert FN_overall==fn365 366 recalls = {name :TP[name]/(TP[name]+FP[name]) if (TP[name] + FP[name]) else 0 for name in self.names}367 #print(recalls)368 precisions = {name : TP[name]/(TP[name]+FN[name]) if (TP[name] + FN[name]) else 0 for name in self.names}369 f_scores={name : 2*recalls[name]*precisions[name]/(recalls[name]+precisions[name]) if (recalls[name]+precisions[name]) else 0 for name in self.names}370 scores_df = pd.DataFrame([recalls,precisions,f_scores],index=['Recall','Precison','F-score'],columns=[name for name in self.names])371 R=TP_overall/(FP_overall+TP_overall)372 P=TP_overall/(TP_overall+FN_overall)373 f_score=2*R*P/(R+P) 374 scores_df['Overall'] = [R,P,f_score]375 if self.show_dfs:376 print(scores_df)377 if self.store_results_dir:378 scores_df.to_csv("{0}/{1}recall_precision_fscores.csv".format(self.store_results_dir,self.sigma_version))379 380 return {'{0}clusters'.format(self.sigma_version) :[clusters],'{0}recall'.format(self.sigma_version) :[R],'{0}precision'.format(self.sigma_version) : [P],'{0}f_score'.format(self.sigma_version) : [f_score]}381"""382def analyse_raw_results_for_scoring(td_object,reclustering=None,test_random_graph=False,383 repeats=1,pass_weights=True,verbose=False):384 raw_results=td_object.raw_results#[:400]385 if td_object.params.get("Use population means"):386 sigma_version='sigma_v2_'387 else:388 sigma_version='sigma_v1_'389 390 391 392 df_all = pd.DataFrame(raw_results,columns=['Z-score','object1','object2']) 393 # remove infinite/nan values394 df_all = df_all.replace(np.inf,np.nan)395 df_all=df_all.dropna(inplace=False) 396 # find p-values for each z-score397 df_all['p-value']=df_all['Z-score'].map(lambda x: stats.norm.cdf(x))398 399 # Extract the population names, object ids and time series for each time series object400 df_all['name1']=[d.name for 
d in df_all['object1']]401 df_all['name2']=[d.name for d in df_all['object2']]402 df_all['time series 1']=[d.t_series for d in df_all['object1']]403 df_all['time series 2']=[d.t_series for d in df_all['object2']]404 df_all['id1']=[id(d) for d in df_all['object1']]405 df_all['id2']=[id(d) for d in df_all['object2']]406 names=np.sort(list(set(df_all['name1']).union(set(df_all['name2']))))407 408 df_to_store=df_all.loc[:,['p-value','Z-score','name1','name2','id1','id2']]409 410 # create a group for each population411 df_matching=df_to_store.loc[df_to_store['name1'] == df_to_store['name2']]412 grouped_matching=df_matching.groupby(['name1'],as_index=False)413 df_new=pd.DataFrame()414 for group,frame in grouped_matching:415 #print(frame)416 df_new[group]=frame.mean()417 418 # store non-matching population results in a separate df419 df_non_matching=df_to_store.drop(labels=df_matching.index) 420 names_dict={name:i for i,name in enumerate(names)} # arbitrary order for names421 swap_dict=collections.defaultdict(int)422 for i in df_non_matching.index:423 name1=df_non_matching.loc[i]['name1']424 name2=df_non_matching.loc[i]['name2']425 if names_dict[name1]>names_dict[name2]:426 #swap names so that, for any pair, the order of names is always the same427 swap_dict[name1]+=1428 df_non_matching.at[i,'name1']=name2429 df_non_matching.at[i,'name2']=name1430 # group by both names431 group_non_matching=df_non_matching.groupby(['name1','name2'],as_index=False)432 if verbose:433 assert(len(group_non_matching)==len(names)*(len(names)-1)/2)434 df_non_matching_mean=pd.DataFrame()435 for group,frame in group_non_matching:436 df_non_matching_mean[group]=frame.mean()437 438 439 if verbose:440 print("Z score and corresponding p-value mean results within each population")441 print(df_new)442 print("Z score and corresponding p-value mean results across populations")443 print(df_non_matching_mean.head(2))444 445 z_across=df_non_matching['Z-score']446 z_within=df_matching['Z-score']447 if 
verbose:448 print("Mean non-matching stats is {0}".format([np.mean(z_across),np.std(z_across)]))449 print("Overall mean Z-score is {0}".format(np.mean(df_all['Z-score'])))450 print("Mean matching stats is {0}".format([np.mean(z_within),np.std(z_within)]))451 452 453 # store dataframes of results in csv files454 455 #df_non_matching.to_csv("{0}\{1}correlations_across_populations.csv".format(pc.TEMP_DIR,sigma_version))456 df_to_store.to_csv("{0}\{1}correlations.csv".format(pc.TEMP_DIR,sigma_version))457 458 if reclustering=='greedy Louvain':459 scores=[]460 j=0461 for i in range(repeats):462 if repeats%(int(repeats/100+1)):463 print("{0}%".format(j),end=',')464 j+=1465 print("\n")466 scores.append(make_partition_and_score(df_all,test_random_graph=test_random_graph,pass_weights=pass_weights,verbose=verbose))467 if repeats:468 print("Over {0} runs of graph and partition, scores are {1}".format(repeats,scores))469 470 return pd.DataFrame.from_dict({"matching": {'mean':np.mean(z_within),471 "std": np.std(z_within)},472 "not matching" : {'mean': np.mean(z_across),473 "std" : np.std(z_across)}474 },orient='index')...

Full Screen

Full Screen

new_scenario.py

Source:new_scenario.py Github

copy

Full Screen

...73 parser_result = e74 scenario_base.store_parser_result(75 scenario_package_dirpath, parser_result,76 options.parser_result_tag)77 scenario_base.store_results_dir(78 scenario_package_dirpath, copied_dirpath)79 scenario_base.write_config(80 scenario_package_dirpath,81 status_version=harness.status_version,82 parser_result_tag=options.parser_result_tag,83 )84 scenario_base.install_unittest_module(85 scenario_package_dirpath, options.template_type)86 tmp_dirpath.clean()87if __name__ == '__main__':...

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub — right from setting up the prerequisites for running your first automation test, to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks, such as Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.

Run autotest automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful