How to use variants method in avocado

Best Python code snippet using avocado_python

variant_caller.py

Source:variant_caller.py Github

copy

Full Screen

...23 op = next(next(cig_iter)[1])24 if op in read_consuming_ops:25 result += length26 return result27def find_variants(sample_in,con_fasta,output):28 bamfile = pysam.AlignmentFile(sample_in)29 con_fasta_dict = {record.id:record for record in SeqIO.parse(con_fasta,'fasta')}30 haplo_dict = {'H1':{'genes':[],'variants':[]},'H2':{'genes':[],'variants':[]}}31 for read in bamfile.fetch():32 haplo_dict[read.reference_name]['genes'].append(read.qname.split('_')[-1])33 test_seq = read.seq34 ref_seq = str(con_fasta_dict[read.reference_name].seq)35 ref_pos = read.reference_start36 test_pos = 037 ref_seq_alligned = ''38 test_seq_alligned = ''39 variants = []40 for match in read.cigar:41 if match[0] == 0: #match42 ref_seq_alligned += ref_seq[ref_pos:ref_pos+match[1]]43 test_seq_alligned += test_seq[test_pos:test_pos+match[1]]44 ref_pos += match[1]45 test_pos += match[1]46 elif match[0] == 1: # insertion47 ref_seq_alligned += '-' * match[1]48 test_seq_alligned += test_seq[test_pos:test_pos+match[1]]49 test_pos += match[1]50 elif match[0] == 2: #deletion51 ref_seq_alligned += ref_seq[ref_pos:ref_pos+match[1]]52 test_seq_alligned += '-' * match[1]53 ref_pos += match[1]54 ref_pos = read.reference_start55 test_pos = 056 convert_table = pd.DataFrame(columns=['con','gene'])57 convert_table_con = []58 convert_table_gene = []59 for i in range(len(ref_seq_alligned)):60 if ref_seq_alligned[i] != test_seq_alligned[i]:61 variants.append([ref_pos,test_pos,ref_seq_alligned[i],test_seq_alligned[i],62 ref_seq_alligned[i-2:i+3],test_seq_alligned[i-2:i+3]])63 if ref_seq_alligned[i] != '-':64 ref_pos += 165 if test_seq_alligned[i] != '-':66 test_pos += 167 convert_table_con.append(ref_pos)68 convert_table_gene.append(test_pos)69 convert_table['con'] = convert_table_con70 convert_table['gene'] = convert_table_gene71 haplo_dict[read.reference_name]['variants'].append({'gene':read.qname.split('_')[-1],72 'position':read.reference_start,73 'stop':read.reference_start + target_len(read.cigarstring),74 'variants':variants,75 'number_variants':len(variants),76 'convert_table':convert_table})77 all_variants = dict()78 gene_dict = dict()79 for haplotype in haplo_dict:80 gene_dict[haplotype] = []81 all_variants[haplotype] = []82 variant_df = pd.DataFrame()83 num_var = 084 for variants in haplo_dict[haplotype]['variants']:85 variant_df.loc[0,num_var]=086 for variant in variants['variants']:87 variant_df.loc[variant[0],num_var]=188 variant_df.loc[variant[0],f'ref_{num_var}']=variant[3]89 variant_df.loc[variant[0],f'alt_{num_var}']=variant[2]90 variant_df.loc[variant[0],f'pos_{num_var}']=variant[1]91 num_var += 192 variant_df = variant_df.sort_index().fillna(0)93 gene_start_stop = dict()94 for i in range(num_var):95 gene_start_stop[i]={'min':0,'max':0}96 gene_start_stop[i]['min']=haplo_dict[haplotype]['variants'][i]['position']-100097 gene_start_stop[i]['max']=haplo_dict[haplotype]['variants'][i]['stop']+100098 gene_num = -199 gene_counter = -1100 for position in variant_df.index[1:]:101 genes = []102 variants = np.array([])103 index = variant_df.index.get_loc(position)104 for gene in gene_start_stop:105 if position >= gene_start_stop[gene]['min'] and position <= gene_start_stop[gene]['max']:106 genes.append(gene)107 variants = np.append(variants,int(sum(variant_df.iloc[max(index-10,0):index+11][gene])))108 if genes[variants.argmin()] != gene_num:109 if gene_num not in genes:110 gene_counter += 1111 gene_dict[haplotype].append(dict(haplo_dict[haplotype]['variants'][genes[variants.argmin()]]))112 elif gene_dict[haplotype][gene_counter]['gene'] != 'hybrid':113 if sum([position < haplo_dict[haplotype]['variants'][x]['position'] for x in genes]) != 0:114 gene_dict[haplotype][gene_counter] = dict(haplo_dict[haplotype]['variants'][genes[variants.argmin()]])115 else:116 gene_dict[haplotype][gene_counter]['gene0'] = str(gene_dict[haplotype][gene_counter]['gene'])117 gene_dict[haplotype][gene_counter]['gene1'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['gene']118 gene_dict[haplotype][gene_counter]['gene'] = 'hybrid'119 gene_dict[haplotype][gene_counter]['variants0'] = gene_dict[haplotype][gene_counter]['variants']120 gene_dict[haplotype][gene_counter]['variants1'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['variants']121 gene_dict[haplotype][gene_counter]['convert_table0'] = gene_dict[haplotype][gene_counter]['convert_table']122 gene_dict[haplotype][gene_counter]['convert_table1'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['convert_table']123 gene_dict[haplotype][gene_counter]['breakpoint0'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['position']124 gene_dict[haplotype][gene_counter]['breakpoint1'] = position125 gene_dict[haplotype][gene_counter]['stop'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['stop']126 gene_dict[haplotype][gene_counter]['counter'] = 1127 else:128 counter = gene_dict[haplotype][gene_counter]['counter'] +1129 gene_dict[haplotype][gene_counter][f'gene{counter}'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['gene']130 gene_dict[haplotype][gene_counter][f'variants{counter}'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['variants']131 gene_dict[haplotype][gene_counter][f'convert_table{counter}'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['convert_table']132 gene_dict[haplotype][gene_counter][f'breakpoint{counter}'] = position133 gene_dict[haplotype][gene_counter]['stop'] = haplo_dict[haplotype]['variants'][genes[variants.argmin()]]['stop']134 gene_dict[haplotype][gene_counter]['counter'] = counter135 gene_num = genes[variants.argmin()]136 variant_df.loc[position,'gene'] = genes[variants.argmin()]137 variant_df.to_csv(f'{output}_{haplotype}.csv')138 for gene in gene_dict[haplotype]:139 if gene['gene'] == 'hybrid':140 for counter in range(gene['counter']):141 for variant in gene[f'variants{counter}']:142 if variant[0] >= gene[f'breakpoint{counter}'] and variant[0] < gene[f'breakpoint{counter+1}']:143 all_variants[haplotype].append(variant)144 for variant in gene[f'variants{counter+1}']:145 if variant[0] >= gene[f'breakpoint{counter+1}']:146 all_variants[haplotype].append(variant)147 else:148 all_variants[haplotype].extend(gene['variants'])149 return all_variants,gene_dict150def decode_variants_gene(151 self, sample, ref_seq, gene_variants, allele, ambig_ref=False, return_all=False):152 """Convert network output in sample to a set of `medaka.vcf.Variant` s.153 A consensus sequence is decoded and compared with a reference sequence.154 Both substitution and indel variants that may be multi-base will be155 reported in the output `medaka.vcf.Variant` s.156 :param sample: `medaka.common.Sample`.157 :param ref_seq: reference sequence, should be upper case158 :param ambig_ref: bool, if True, decode variants at ambiguous (N)159 reference positions.160 :param return_all: bool, emit VCF records with `'.'` ALT for reference161 positions which contain no variant.162 :returns: list of `medaka.vcf.Variant` objects.163 """164 logger = medaka.common.get_named_logger('Variants')165 if sample.positions['minor'][0] != 0:166 raise ValueError(167 "The first position of a sample must not be an insertion.")168 # TODO: the code below uses all of: unicode arrays, python strings,169 # and integer encodings. This could be simplified170 pos = sample.positions171 probs = sample.label_probs172 encoding = self._encoding173 # array of symbols retaining gaps174 predicted = self.decode_consensus(sample, with_gaps=True, dtype='|U1')175 # get reference sequence with insertions marked as '*'176 reference = np.full(len(pos), "*", dtype="|U1")177 reference[pos['minor'] == 0] = np.fromiter(178 ref_seq[pos['major'][0]:pos['major'][-1] + 1],179 dtype='|U1')180 # change reference gene sequence181 ref_pos = pos['major'][0] -1182 t1 = -1183 test_teller = 0184 while test_teller < len(gene_variants) and gene_variants[test_teller][0] < ref_pos:185 test_teller += 1186 other_variants = []187 for nuc in reference:188 if test_teller < len(gene_variants):189 t1 += 1190 if nuc != '*':191 ref_pos += 1192 if gene_variants[test_teller][2] == '-':193 if ref_pos == gene_variants[test_teller][0]-1:194 if reference[t1+1] == '*':195 reference[t1+1] = gene_variants[test_teller][3]196 ref_pos -= 1197 elif reference[t1+1] == gene_variants[test_teller][3] and reference[t1+2] == '*':198 reference[t1+2] = gene_variants[test_teller][3]199 ref_pos -= 1200 else:201 logger.info('insert error')202 other_variants.append([allele]+gene_variants[test_teller])203 test_teller += 1204 while test_teller < len(gene_variants) and gene_variants[test_teller][0]-1 == ref_pos:205 other_variants.append([allele]+gene_variants[test_teller])206 test_teller += 1207 test_teller -= 1208 test_teller += 1209 else:210 if ref_pos == gene_variants[test_teller][0]:211 if reference[t1] == gene_variants[test_teller][2]:212 reference[t1] = gene_variants[test_teller][3]213 else:214 logger.info('error')215 other_variants.append([allele]+gene_variants[test_teller])216 test_teller += 1217 # find variant columns using prescription218 is_variant = self._find_variants(pos['minor'], reference, predicted)219 variants = []220 runs = medaka.rle.rle(is_variant)221 runs = runs[np.where(runs['value'])]222 for rlen, rstart, is_var in runs:223 rend = rstart + rlen224 # get the ref/predicted sequence with/without gaps225 var_ref_with_gaps = ''.join(s for s in reference[rstart:rend])226 var_pred_with_gaps = ''.join(s for s in predicted[rstart:rend])227 var_ref = var_ref_with_gaps.replace('*', '')228 var_pred = var_pred_with_gaps.replace('*', '')229 # del followed by insertion can lead to non-variant230 # maybe skip things if reference contains ambiguous symbols231 if var_ref == var_pred and is_var:232 continue233 elif (not ambig_ref and234 not set(var_ref).issubset(set(self.symbols))):235 continue236 # As N is not in encoding, encode N as *237 # This only affects calculation of quals.238 var_ref_encoded = (239 encoding[(s if s != 'N' else '*',)]240 for s in var_ref_with_gaps)241 var_pred_encoded = (242 encoding[(s,)] for s in var_pred_with_gaps)243 # calculate probabilities244 var_probs = probs[rstart:rend]245 ref_probs = np.array(246 [var_probs[i, j] for i, j in enumerate(var_ref_encoded)])247 pred_probs = np.array(248 [var_probs[i, j] for i, j in enumerate(var_pred_encoded)])249 ref_quals = self._phred(1.0 - ref_probs)250 pred_quals = self._phred(1.0 - pred_probs)251 info = dict()252 if self.verbose:253 info = {254 'ref_seq': var_ref_with_gaps,255 'pred_seq': var_pred_with_gaps,256 'ref_qs': ','.join((self._pfmt(q) for q in ref_quals)),257 'pred_qs': ','.join((self._pfmt(q) for q in pred_quals)),258 'ref_q': self._pfmt(sum(ref_quals)),259 'pred_q': self._pfmt(sum(pred_quals)),260 'n_cols': len(pred_quals)}261 # log likelihood ratio262 qual = sum(pred_quals) - sum(ref_quals)263 genotype = {'GT': '1', 'GQ': self._pfmt(qual, 0)}264 # position in reference sequence265 var_pos = pos['major'][rstart]266 if pos['minor'][rstart] != 0:267 # variant starts on insert, prepend ref base (normalization268 # doesnt handle this)269 var_ref = ref_seq[var_pos] + var_ref270 var_pred = ref_seq[var_pos] + var_pred271 # create the variant record and normalize272 variant = medaka.vcf.Variant(273 sample.ref_name, var_pos, var_ref,274 alt=var_pred, filt='PASS', info=info, qual=self._pfmt(qual),275 genotype_data=genotype)276 variant = variant.normalize(reference=ref_seq)277 variants.append(variant)278 if return_all:279 # to avoid complications from deletions and multi-reference spans,280 # we simply output a record for every minor == 0 position281 sites = pos['minor'] == 0282 _pos = pos['major'][sites]283 _probs = probs[sites]284 _ref = reference[sites]285 _enc = [encoding[(s if s != 'N' else '*',)] for s in _ref]286 _quals = self._phred(287 1.0 - np.array(_probs[np.arange(_probs.shape[0]), _enc]))288 _quals_flt = np.char.mod("%.3f", _quals)289 _quals_int = np.char.mod("%d", np.rint(_quals))290 info = dict()291 for p, base, qf, qi in zip(_pos, _ref, _quals_flt, _quals_int):292 genotype = medaka.vcf.GenotypeData(GT='0', GQ=qi)293 variants.append(294 medaka.vcf.Variant(295 sample.ref_name, p, base,296 alt='.', filt='.', info=info, qual=qf,297 genotype_data=genotype))298 variants.sort(key=lambda x: x.pos)299 return variants,other_variants300class ConvertCoordinates():301 logger = medaka.common.get_named_logger('Variants')302 303 _exon_table = pd.DataFrame()304 def _load_exon_data(self,transcript_name):305 if self._exon_table.empty:306 df = read_gtf(self.args.ref_gtf)307 self._exon_table = df[df['feature'] == 'exon']308 df_gene = self._exon_table[self._exon_table['transcript_name'] == transcript_name]309 exon_dict = dict()310 for row in df_gene.index:311 exon_dict[df_gene.loc[row,'exon_number']] = {'start':df_gene.loc[row,'start'],312 'stop': df_gene.loc[row,'end']}313 return exon_dict314 def _star_alleles(self):315 variants = pd.read_csv(self.args.star_alleles,sep='\t',header=1)316 self.variant_dict = dict()317 for row in variants.index:318 if variants.loc[row,'Variant Start']!= '.':319 variant_start = int(variants.loc[row,'Variant Start'])320 if variant_start not in self.variant_dict:321 self.variant_dict[variant_start] = {'ref':variants.loc[row,'Reference Allele'],322 'alt':{variants.loc[row,'Variant Allele']:[variants.loc[row,'Haplotype Name']]}}323 elif variants.loc[row,'Variant Allele'] not in self.variant_dict[variant_start]['alt']:324 self.variant_dict[variant_start]['alt']=variants.loc[row,'Haplotype Name']325 else:326 self.variant_dict[variant_start]['alt'][variants.loc[row,'Variant Allele']].append(variants.loc[row,'Haplotype Name'])327 self.variant_dict_star = dict()328 for row in variants.index:329 if variants.loc[row,'Variant Start'] != '.':330 variant_haplotype_name = variants.loc[row,'Haplotype Name']331 if variant_haplotype_name not in self.variant_dict_star:332 self.variant_dict_star[variant_haplotype_name] = [(int(variants.loc[row,'Variant Start']),variants.loc[row,'Reference Allele'],variants.loc[row,'Variant Allele'])]333 else:334 self.variant_dict_star[variant_haplotype_name].append((int(variants.loc[row,'Variant Start']),variants.loc[row,'Reference Allele'],variants.loc[row,'Variant Allele']))335 def _convert_extra_varaints(self):336 extra_variants = pd.read_csv(f'{self.args.output}_extra.txt',sep='\t')337 position = -1338 new_row = -1339 exta_variants_converted = pd.DataFrame(columns = ['#CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','SAMPLE'])340 for row in extra_variants.index:341 if extra_variants.loc[row,'con_pos'] == position:342 exta_variants_converted.loc[new_row,'REF'] += extra_variants.loc[row,'gene_env'][2].replace('-','')343 exta_variants_converted.loc[new_row,'ALT'] += extra_variants.loc[row,'con_env'][2].replace('-','')344 elif extra_variants.loc[row,'gene_env'][1] == '-':345 if extra_variants.loc[row,'gene_env'][0] != '-':346 new_row += 1347 exta_variants_converted.loc[new_row,'#CHROM'] = extra_variants.loc[row,'allele']348 exta_variants_converted.loc[new_row,'POS'] = extra_variants.loc[row,'con_pos']349 position = extra_variants.loc[row,'con_pos']350 exta_variants_converted.loc[new_row,'REF'] = extra_variants.loc[row,'gene_env'][0:3].replace('-','')351 exta_variants_converted.loc[new_row,'ALT'] = extra_variants.loc[row,'con_env'][0:3].replace('-','')352 else:353 print('variant error')354 os.exit(1)355 elif extra_variants.loc[row,'con_env'][1] == '-':356 if extra_variants.loc[row,'con_env'][0] != '-':357 new_row += 1358 exta_variants_converted.loc[new_row,'#CHROM'] = extra_variants.loc[row,'allele']359 exta_variants_converted.loc[new_row,'POS'] = extra_variants.loc[row,'con_pos']360 position = extra_variants.loc[row,'con_pos']361 exta_variants_converted.loc[new_row,'REF'] = extra_variants.loc[row,'gene_env'][0:3].replace('-','')362 exta_variants_converted.loc[new_row,'ALT'] = extra_variants.loc[row,'con_env'][0:3].replace('-','')363 else:364 print('variant error')365 os.exit(1)366 else:367 new_row += 1368 exta_variants_converted.loc[new_row,'#CHROM'] = extra_variants.loc[row,'allele']369 exta_variants_converted.loc[new_row,'POS'] = extra_variants.loc[row,'con_pos']370 position = extra_variants.loc[row,'con_pos']371 exta_variants_converted.loc[new_row,'REF'] = extra_variants.loc[row,'gene_env'][1:3].replace('-','')372 exta_variants_converted.loc[new_row,'ALT'] = extra_variants.loc[row,'con_env'][1:3].replace('-','')373 exta_variants_converted['ID'] = '.'374 exta_variants_converted['QUAL'] = '100'375 exta_variants_converted['FILTER'] = 'PASS'376 exta_variants_converted['INFO'] = '.'377 exta_variants_converted['FORMAT'] = 'GT:GQ'378 exta_variants_converted['SAMPLE'] = '1:100'379 exta_variants_converted.to_csv(f'{self.args.output}_extra.vcf',sep='\t',index=False)380 return exta_variants_converted381 def __init__(self,args):382 header_count = 0383 self.args = args384 hap_vcf = f'{self.args.output}.vcf'385 self.header = []386 with open(hap_vcf) as fh:387 for line in fh.readlines():388 if line.startswith("##"):389 self.header.append(line)390 header_count += 1391 else:392 break393 self.variants = pd.read_csv(hap_vcf,sep='\t',header=header_count).append(self._convert_extra_varaints(),ignore_index=True).sort_values(by=['#CHROM','POS'])394 self._star_alleles()395 with open(args.gene_json) as fh:396 data = json.loads(fh.read())397 self.genes = data['genes']398 self.hybrids = data['hybrids']399 self.star_allele_genes = data['star_allele_genes']400 def _write_vcf(self,variant,counter,row,true_variants,con_allele,len_true_variants):401 with open(f'{self.args.output}_{con_allele}_gene_{counter}_{self.gene_info[con_allele][counter]["gene"]}.vcf','w') as fh:402 for line in self.header:403 if not line.startswith('##contig'):404 fh.write(line)405 if self.gene_info[con_allele][counter]["gene"] == 'hybrid':406 fh.write(f'##contig=<ID={self.genes[self.gene_info[con_allele][counter]["gene1"]]["chromosome"]},length={self.genes[self.gene_info[con_allele][counter]["gene1"]]["contig_length"]}>\n')407 fh.write(f'##gene={self.gene_info[con_allele][counter]["gene0"]}\n')408 fh.write(f'##gene={self.gene_info[con_allele][counter]["gene1"]}\n')409 else:410 fh.write(f'##contig=<ID={self.genes[self.gene_info[con_allele][counter]["gene"]]["chromosome"]},length={self.genes[self.gene_info[con_allele][counter]["gene"]]["contig_length"]}>\n')411 fh.write(f'##gene={self.gene_info[con_allele][counter]["gene"]}\n')412 fh.write('\t'.join(list(self.variants.columns))+'\n')413 for line in variant:414 fh.write('\t'.join([str(x) for x in line])+'\n')415 break_point_dict = dict()416 if self.gene_info[con_allele][counter]["gene"] == 'hybrid':417 for break_point in range(self.gene_info[con_allele][counter]['counter']):418 convert_table0 = self.gene_info[con_allele][counter][f'convert_table{break_point}']419 gene_start0 = self.genes[self.gene_info[con_allele][counter][f'gene{break_point}']]['start']420 convert_table1 = self.gene_info[con_allele][counter][f'convert_table{break_point+1}']421 gene_start1 = self.genes[self.gene_info[con_allele][counter][f'gene{break_point+1}']]['start']422 break_point_position = self.gene_info[con_allele][counter][f'breakpoint{break_point+1}']423 break_point_dict[break_point]={'gene0':self.gene_info[con_allele][counter][f'gene{break_point}'],424 'gene1':self.gene_info[con_allele][counter][f'gene{break_point+1}'],425 'position0':int(convert_table0.loc[convert_table0["con"]==break_point_position,"gene"].iloc[0]) + gene_start0 -1,426 'position1':int(convert_table1.loc[convert_table1["con"]==break_point_position,"gene"].iloc[0]) + gene_start1 -1}427 if self.variants.loc[row,"#CHROM"] not in self.gene_dict:428 self.gene_dict[con_allele] = {counter:{'gene':self.gene_info[con_allele][counter]["gene"],429 'variants': variant,430 'true_variants':true_variants,431 'len_true_variants':len_true_variants,432 'breakpoints':break_point_dict}}433 else:434 self.gene_dict[con_allele][counter] = {'gene':self.gene_info[con_allele][counter]["gene"],435 'variants': variant,436 'true_variants':true_variants,437 'len_true_variants':len_true_variants,438 'breakpoints':break_point_dict}439 def convert(self,gene_info):440 self.gene_info = gene_info441 self.gene_dict = dict()442 counter = 0443 variant = []444 true_variants = []445 len_true_variants = 0446 breakpoint_number = 0447 breakpoint_number_new = 0448 write = False449 con_allele = self.variants.loc[0,'#CHROM']450 row_allele = self.variants.loc[0,'#CHROM']451 position_num = -1452 for row in self.variants.index:453 if self.variants.loc[row,'POS'] == position_num:454 variant = variant[:-2]455 position_num = self.variants.loc[row,'POS']456 row_allele = self.variants.loc[row,'#CHROM']457 if con_allele != row_allele:458 if write:459 self._write_vcf(variant,counter,row,true_variants,con_allele,len_true_variants)460 variant = []461 true_variants = []462 len_true_variants = 0463 breakpoint_number = 0464 write = False465 counter = 0466 con_allele = row_allele467 if counter < len(self.gene_info[row_allele]):468 if self.gene_info[row_allele][counter]['stop'] < self.variants.loc[row,'POS']:469 self._write_vcf(variant,counter,row,true_variants,con_allele,len_true_variants)470 variant = []471 true_variants = []472 len_true_variants = 0473 breakpoint_number = 0474 write = False475 counter +=1476 if counter < len(self.gene_info[row_allele]):477 if self.gene_info[row_allele][counter]['position'] <= self.variants.loc[row,'POS']:478 position = self.variants.loc[row,'POS']479 if self.gene_info[row_allele][counter]["gene"] == 'hybrid':480 breakpoint_boal = False481 for breakpoint_counter in range(self.gene_info[row_allele][counter]['counter']+1):482 if self.variants.loc[row,'POS'] >= self.gene_info[row_allele][counter][f"breakpoint{breakpoint_counter}"]:483 if breakpoint_counter > breakpoint_number:484 breakpoint_boal = True485 breakpoint_number_new = breakpoint_counter486 if breakpoint_boal:487 convert_table0 = self.gene_info[row_allele][counter][f'convert_table{breakpoint_number_new-1}']488 gene_start0 = self.genes[self.gene_info[row_allele][counter][f'gene{breakpoint_number_new-1}']]['start']489 gene0 = self.gene_info[row_allele][counter][f'gene{breakpoint_number_new-1}']490 convert_table1 = self.gene_info[row_allele][counter][f'convert_table{breakpoint_number_new}']491 gene_start1 = self.genes[self.gene_info[row_allele][counter][f'gene{breakpoint_number_new}']]['start']492 gene1 = self.gene_info[row_allele][counter][f'gene{breakpoint_number_new}']493 variant.append([f'breakpoint: {int(convert_table0.loc[convert_table0["con"]==position,"gene"].iloc[0]) + gene_start0 -1} - {int(convert_table1.loc[convert_table1["con"]==position,"gene"].iloc[0]) + gene_start1 -1}\t{gene0}-{gene1}'])494 gene = self.gene_info[row_allele][counter][f'gene{breakpoint_number_new}']495 convert_table = self.gene_info[row_allele][counter][f'convert_table{breakpoint_number_new}']496 gene_start = self.genes[self.gene_info[row_allele][counter][f'gene{breakpoint_number_new}']]['start']497 chromosome = self.genes[self.gene_info[row_allele][counter][f'gene{breakpoint_number_new}']]['chromosome']498 breakpoint_number = breakpoint_number_new499 else:500 gene = self.gene_info[row_allele][counter]["gene"]501 convert_table = self.gene_info[row_allele][counter]['convert_table']502 gene_start = self.genes[self.gene_info[row_allele][counter]['gene']]['start']503 chromosome = self.genes[self.gene_info[row_allele][counter]['gene']]['chromosome']504 self.variants.loc[row,'POS'] = int(convert_table.loc[convert_table['con']==position,'gene'].iloc[0]) + gene_start -1505 if gene in self.star_allele_genes:506 if len(self.variants.loc[row,'REF']) > len(self.variants.loc[row,'ALT']):507 true_variant_position = self.variants.loc[row,'POS'] + 1508 alt_allele = '-'509 elif len(self.variants.loc[row,'REF']) < len(self.variants.loc[row,'ALT']):510 true_variant_position = self.variants.loc[row,'POS']511 alt_allele = self.variants.loc[row,'ALT'][1:]512 else:513 true_variant_position = self.variants.loc[row,'POS']514 alt_allele = self.variants.loc[row,'ALT']515 if true_variant_position in self.variant_dict:516 if alt_allele in self.variant_dict[true_variant_position]['alt']:517 len_true_variants += 1518 true_variants.extend(self.variant_dict[true_variant_position]['alt'][alt_allele])519 else:520 self.logger.info(f'allele {alt_allele} do not exists ')521 self.variants.loc[row,'#CHROM'] = chromosome522 variant.append(self.variants.loc[row])523 write = True524 else:525 if write:526 self._write_vcf(variant,counter,row,true_variants,con_allele,len_true_variants)527 variant = []528 true_variants = []529 len_true_variants = 0530 breakpoint_number = 0531 write = False532 if write:533 self._write_vcf(variant,counter,row,true_variants,con_allele,len_true_variants)534 def annotate(self):535 with open(f'{self.args.output}_haplotypes.txt','w') as fh:536 for haplotype in self.gene_dict:537 fh.write(f'{haplotype}\n')538 for gene in self.gene_dict[haplotype]:539 fh.write(f'Gene {gene} is {self.gene_dict[haplotype][gene]["gene"]} with {len(self.gene_dict[haplotype][gene]["variants"])} variants\n')540 self.logger.info(self.gene_dict[haplotype][gene]["gene"])541 if self.gene_dict[haplotype][gene]["gene"] == 'hybrid':542 gene_intron_exon_dict = dict()543 start_position = dict()544 for gene_name in self.genes.keys():545 start_position[gene_name] = -1546 gene_intron_exon_dict[gene_name] = {'exons':set(),'introns':set()}547 for break_point in self.gene_dict[haplotype][gene]["breakpoints"]:548 break_point_dict = self.gene_dict[haplotype][gene]["breakpoints"][break_point]549 exon_data = self._load_exon_data(self.genes[break_point_dict['gene0']]['transcript_name'])550 self.logger.info(break_point_dict)551 for exon in exon_data:552 if exon_data[exon]['start'] >= start_position[break_point_dict['gene0']] and exon_data[exon]['start'] <= break_point_dict['position0']:553 if exon_data[exon]['stop'] >= start_position[break_point_dict['gene0']] and exon_data[exon]['stop'] <= break_point_dict['position0']:554 gene_intron_exon_dict[break_point_dict['gene0']]['exons'].add(int(exon))555 if str(int(exon)+1) in exon_data:556 if exon_data[str(int(exon)+1)]['stop'] >= start_position[break_point_dict['gene0']] and exon_data[str(int(exon)+1)]['stop'] <= break_point_dict['position0']:557 if exon_data[exon]['start'] >= start_position[break_point_dict['gene0']] and exon_data[exon]['start'] <= break_point_dict['position0']:558 gene_intron_exon_dict[break_point_dict['gene0']]['introns'].add(int(exon)+1)559 elif exon_data[exon]['start'] >= start_position[break_point_dict['gene0']] and exon_data[exon]['start'] <= break_point_dict['position0']:560 gene_intron_exon_dict[break_point_dict['gene0']]['introns'].add(int(exon))561 start_position[break_point_dict['gene1']] = break_point_dict['position1']562 exon_data = self._load_exon_data(self.genes[break_point_dict['gene1']]['transcript_name'])563 for exon in exon_data:564 if exon_data[exon]['start'] >= start_position[break_point_dict['gene1']]:565 gene_intron_exon_dict[break_point_dict['gene1']]['exons'].add(int(exon))566 if exon_data[exon]['stop'] >= start_position[break_point_dict['gene1']]:567 gene_intron_exon_dict[break_point_dict['gene1']]['exons'].add(int(exon))568 if str(int(exon)+1) in exon_data:569 if exon_data[str(int(exon)+1)]['stop'] >= start_position[break_point_dict['gene1']]:570 gene_intron_exon_dict[break_point_dict['gene1']]['introns'].add(int(exon)+1)571 if exon_data[exon]['start'] >= start_position[break_point_dict['gene1']]:572 gene_intron_exon_dict[break_point_dict['gene1']]['introns'].add(int(exon))573 for hybrid in self.hybrids:574 boal_list = []575 for gene_name in self.genes.keys():576 for intron_exon in ['introns','exons']:577 boal_list.append(set(self.hybrids[hybrid][gene_name][intron_exon]).issubset(gene_intron_exon_dict[gene_name][intron_exon]))578 if sum(boal_list) == 4:579 fh.write(f'This hybrid is {hybrid}\n')580 self.logger.info(gene_intron_exon_dict)581 else:582 if self.gene_dict[haplotype][gene]['gene'] in self.star_allele_genes:583 true_variants = self.gene_dict[haplotype][gene]['true_variants']584 true_variants_counter = self.gene_dict[haplotype][gene]['len_true_variants']585 df=pd.DataFrame({'Number': true_variants})586 df1 = pd.DataFrame(df['Number'].value_counts())587 df1['Count']=df1['Number'].index588 fh.write(f'{df1.Number.max()} of the {true_variants_counter} star allele variants are of these haplotypes\n')589 haplotypes = list(df1[df1['Number']==df1.Number.max()]['Count'])590 for haplo in haplotypes:591 if len(self.variant_dict_star[haplo]) > df1.Number.max():592 fh.write(f'{haplo} has more variants than in sample: {len(self.variant_dict_star[haplo])}\n')593 elif len(self.variant_dict_star[haplo]) == df1.Number.max():594 fh.write(f'{haplo} is the correct haplotype\n')595 else:596 fh.write(f'{haplo} has less variants than in sample: {len(self.variant_dict_star[haplo])}\n')597def variants_from_hdf(args):598 """Entry point for variant calling from HDF5 files.599 A `LabelScheme` read from HDF must define both a `decode_variants`600 and `decode_consnesus` method. The latter is used with `join_samples`601 to detect multi-locus variants spanning `Sample` slice boundaries.602 """603 logger = medaka.common.get_named_logger('Variants')604 index = medaka.datastore.DataIndex(args.inputs)605 args.regions = index.regions606 # lookup LabelScheme stored in HDF5607 try:608 label_scheme = index.metadata['label_scheme']609 except KeyError:610 logger.debug(611 "Could not find `label_scheme` metadata in input file, "612 "assuming HaploidLabelScheme.")613 label_scheme = medaka.labels.HaploidLabelScheme()614 label_scheme.decode_variants_gene = types.MethodType(decode_variants_gene, label_scheme)615 logger.debug("Label decoding is:\n{}".format(616 '\n'.join('{}: {}'.format(k, v)617 for k, v in label_scheme._decoding.items())))618 if not hasattr(label_scheme, 'decode_variants'):619 raise AttributeError(620 '{} does not support decoding of variants'.format(label_scheme))621 if not hasattr(label_scheme, 'decode_consensus'):622 raise AttributeError(623 '{} does not support consensus decoding required '624 'for variant calling.'.format(label_scheme))625 # tell label_scheme whether we want verbose info fields626 label_scheme.verbose = args.verbose627 meta_info = label_scheme.variant_metainfo628 with pysam.FastaFile(args.ref_fasta) as fa:629 lengths = dict(zip(fa.references, fa.lengths))630 all_variants, gene_info = find_variants(args.sample_in,args.ref_fasta,args.output)631 other_variants = []632 with medaka.vcf.VCFWriter(633 f'{args.output}.vcf', 'w', version='4.1',634 contigs=['{},length={}'.format(r.ref_name, lengths[r.ref_name])635 for r in args.regions],636 meta_info=meta_info) as vcf_writer:637 for reg in args.regions:638 gene_variants = all_variants[reg.ref_name]639 logger.info("Processing {}.".format(reg))640 ref_seq = pysam.FastaFile(args.ref_fasta).fetch(641 reference=reg.ref_name).upper()642 samples = index.yield_from_feature_files([reg])643 trimmed_samples = medaka.common.Sample.trim_samples(samples)644 joined_samples = medaka.variant.join_samples(645 trimmed_samples, ref_seq, label_scheme)646 for sample in joined_samples:647 variants,other_variant = label_scheme.decode_variants_gene(648 sample, ref_seq, gene_variants, reg.ref_name, ambig_ref=args.ambig_ref,649 return_all=args.gvcf)650 vcf_writer.write_variants(variants, sort=True)651 other_variants.extend(other_variant)652 with open(f'{args.output}_extra.txt','w') as fh:653 fh.write('allele\tcon_pos\tgene_pos\tcon_allele\tgene_allele\tcon_env\tgene_env')654 for other_variant in other_variants:655 fh.write('\n')656 fh.write('\t'.join([str(x) for x in other_variant]))657 convert_object = ConvertCoordinates(args)658 convert_object.convert(gene_info)659 convert_object.annotate()660if __name__ == "__main__":661 parser = argparse.ArgumentParser()662 parser.add_argument('ref_fasta', help='Reference sequence .fasta file.')663 parser.add_argument('sample_in', help='Mapped genes to reference sequence .bam file.')664 parser.add_argument('inputs', help='Consensus .hdf files.')...

Full Screen

Full Screen

test_form_params.py

Source:test_form_params.py Github

copy

Full Screen

...85 bigform_data = form_with_radio + form_select_misc86 form = create_form_params_helper(bigform_data)87 orig_items = form.items()88 # Generate the variants89 variants = [v for v in form.get_variants(mode=MODE_TMB)]90 self.assertEqual(orig_items, form.items())91 def test_tmb_variants(self):92 # 'top-middle-bottom' mode variants93 def filter_tmb(values):94 if len(values) > 3:95 values = (values[0],96 values[len(values) / 2],97 values[-1])98 return values99 bigform_data = form_with_radio + form_select_misc100 clean_data = get_grouped_data(bigform_data)101 new_bigform = create_form_params_helper(bigform_data)102 total_variants = 2 * 2 * 3103 variants_set = set()104 variants = [v for v in new_bigform.get_variants(mode=MODE_TMB)]105 for i, form_variant in enumerate(variants):106 # First element must be the created `new_bigform`107 if i == 0:108 self.assertIs(new_bigform, form_variant)109 continue110 for name, values in clean_data.items():111 tmb_values = filter_tmb(values)112 self.assertIn(form_variant[name][0], tmb_values)113 variants_set.add(repr(form_variant))114 # Ensure we actually got the expected number of variants115 f = FormParameters()116 self.assertEquals(len(variants), total_variants + 1)117 # Variants shouldn't appear duplicated118 self.assertEquals(len(variants_set), total_variants)119 def test_tmb_variants_large(self):120 """121 Note that this test has several changes from test_tmb_variants:122 * It uses form_select_misc_large, which exceeds the form's123 TOP_VARIANTS = 15124 * Doesn't use filter_tmb since variants are based on a "random pick"125 """126 bigform_data = (form_with_radio +127 form_select_cars +128 form_select_misc_large)129 clean_data = get_grouped_data(bigform_data)130 new_bigform = create_form_params_helper(bigform_data)131 # total_variants = 2 * 3 * 3 * 3132 variants_set = set()133 variants = [v for v in new_bigform.get_variants(mode=MODE_TMB)]134 # Please note that this depends completely in form.SEED AND135 # form.TOP_VARIANTS136 RANDOM_PICKS = {1: ('volvo', 'black', 'd', 'female'),137 2: ('volvo', 'blue', 'i', 'male'),138 3: ('volvo', 'blue', 'f', 'female'),139 4: ('volvo', 'black', 'g', 'female'),140 5: ('volvo', 'black', 'm', 'male'),141 6: ('volvo', 'black', 'l', 'male'),142 7: ('volvo', 'blue', 'b', 'female'),143 8: ('volvo', 'blue', 'e', 'female'),144 9: ('volvo', 'black', 'c', 'male'),145 10: ('volvo', 'black', 'a', 'female'),146 11: ('volvo', 'blue', 'e', 'male'),147 12: ('volvo', 'black', 'j', 'male'),148 13: ('volvo', 'blue', 'c', 'male'),149 14: ('volvo', 'black', 'a', 'male'),150 15: ('volvo', 'black', 'i', 'female')}151 for i, form_variant in enumerate(variants):152 # First element must be the created `new_bigform`153 if i == 0:154 self.assertIs(new_bigform, form_variant)155 continue156 option = []157 for name, values in clean_data.items():158 form_value = form_variant[name][0]159 option.append(form_value)160 current_option = RANDOM_PICKS[i]161 self.assertEqual(tuple(option), current_option)162 variants_set.add(repr(form_variant))163 # Ensure we actually got the expected number of variants164 f = FormParameters()165 self.assertEquals(len(variants), f.TOP_VARIANTS + 1)166 # Variants shouldn't appear duplicated167 self.assertEquals(len(variants_set), f.TOP_VARIANTS)168 def test_all_variants(self):169 # 'all' mode variants170 bigform_data = form_with_radio + form_select_misc171 clean_data = get_grouped_data(bigform_data)172 new_bigform = create_form_params_helper(bigform_data)173 total_variants = 2 * 5 * 10174 variants_set = set()175 for i, form_variant in enumerate(new_bigform.get_variants(mode=MODE_ALL)):176 # First element must be the created `new_bigform`177 if i == 0:178 self.assertIs(new_bigform, form_variant)179 continue180 for name, all_values in clean_data.items():181 self.assertIn(form_variant[name][0], all_values)182 variants_set.add(repr(form_variant))183 # Ensure we actually got the expected number of variants184 f = FormParameters()185 expected = min(total_variants, f.TOP_VARIANTS)186 self.assertEquals(expected, i)187 # Variants shouldn't duplicated188 self.assertEquals(expected, len(variants_set))189 def test_t_b_variants(self):190 # 'top' and 'bottom' variants191 bigform_data = form_with_radio + form_select_cars + form_select_misc192 clean_data = get_grouped_data(bigform_data)193 new_bigform = create_form_params_helper(bigform_data)194 total_variants = 1195 # 'top' mode variants196 t_form_variants = [fv for fv in new_bigform.get_variants(mode=MODE_T)][1:]197 # Ensure we actually got the expected number of variants198 self.assertEquals(total_variants, len(t_form_variants))199 for name, values in clean_data.items():200 self.assertEquals(values[0], t_form_variants[0][name][0])201 # 'bottom' mode variants202 t_form_variants = [fv for fv in new_bigform.get_variants(mode=MODE_B)][1:]203 # Ensure we actually got the expected number of variants204 self.assertEquals(total_variants, len(t_form_variants))205 for name, values in clean_data.items():206 self.assertEquals(values[-1], t_form_variants[0][name][0])207 def test_max_variants(self):208 # Combinatoric explosion (mode=MODE_ALL): total_variants = 2*5*5*5 =209 # 250 > dc.Form.TOP_VARIANTS = 150210 new_form = create_form_params_helper(form_with_radio +211 form_select_cars +212 form_select_misc)213 self.assertEquals(FormParameters.TOP_VARIANTS,214 len([fv for fv in new_form.get_variants(mode=MODE_ALL)]) - 1)215 def test_max_variants_many_fields(self):216 # Makes sure that the get_variants will return 15 even when we have tons217 # of parameters218 form_params = []219 for i in xrange(50):220 form_params.append({'type': 'select',221 'name': 'cars_%s' % i,222 'values': ('volvo_%s' % i,223 'saab_%s' % i,224 'jeep_%s' % i,225 'chevy_%s' % i,226 'fiat_%s' % i,)})227 # Really large form228 new_form = create_form_params_helper(form_params)229 start_time = time.time()230 self.assertEquals(FormParameters.TOP_VARIANTS,231 len([fv for fv in new_form.get_variants(mode=MODE_TMB)]) - 1)232 # With the previous version of our code this took considerable time, because range(10 ** 9)233 # was called (doh!) creating 10 ** 9 integer objects in memory, which took a lot of time234 # to complete (20 sec) and consumed many GB of RAM.235 #236 # Since measuring memory usage for a test is hard, I'm measuring spent time to make sure237 # we don't have this regression anymore.238 spent_time = time.time() - start_time239 self.assertGreaterEqual(1, spent_time)240 def test_same_variants_generation(self):241 # Combinatoric explosion (mode=MODE_ALL): total_variants = 250 > 150242 #243 # Therefore will be used random variants generation. We should get the244 # same every time we call `form.get_variants`245 new_form = create_form_params_helper(form_with_radio +246 form_select_cars +247 form_select_misc)248 get_all_variants = lambda: set(repr(fv) for fv in249 new_form.get_variants(mode=MODE_ALL))250 variants = get_all_variants()251 for i in xrange(10):252 self.assertEquals(variants, get_all_variants())253 def test_empty_select_all(self):254 """255 This tests for handling of "select" tags that have no options inside.256 The get_variants method should return a variant with the select tag name257 that is always an empty string.258 In this case I'm going to call get_variants with mode=MODE_ALL259 """260 new_form = create_form_params_helper(form_with_radio +261 form_select_cars +262 form_select_misc +263 form_select_empty)264 [i for i in new_form.get_variants(mode=MODE_ALL)]265 def test_empty_select_tb(self):266 """267 This tests for handling of "select" tags that have no options inside.268 The get_variants method should return a variant with the select tag name269 that is always an empty string.270 In this case I'm going to call get_variants with mode=MODE_TB271 This is the case reported by Taras at https://sourceforge.net/apps/trac/w3af/ticket/171015272 """273 new_form = create_form_params_helper(form_with_radio +274 form_select_cars +275 form_select_misc +276 form_select_empty)277 [i for i in new_form.get_variants(mode=MODE_TB)]278 def test_form_params_deepish_copy(self):279 form = create_form_params_helper(form_with_radio + form_with_checkbox)280 copy = form.deepish_copy()281 self.assertEqual(form.items(), copy.items())282 self.assertEqual(form._method, copy._method)283 self.assertEqual(form._action, copy._action)284 self.assertIsNot(form, copy)285 self.assertEquals(copy.get_parameter_type('sex'), INPUT_TYPE_RADIO)286 def test_form_params_deep_copy(self):287 form = create_form_params_helper(form_with_radio + form_with_checkbox)288 form_copy = copy.deepcopy(form)289 self.assertEqual(form.items(), form_copy.items())290 self.assertEqual(form._method, form_copy._method)291 self.assertEqual(form._action, form_copy._action)...

Full Screen

Full Screen

variant_db.py

Source:variant_db.py Github

copy

Full Screen

...79 See the notes on PARAMS_MAX_VARIANTS and PATH_MAX_VARIANTS above. Also80 understand that we'll keep "dirty" versions of the references/fuzzable81 requests in order to be able to answer "False" to a call for82 need_more_variants in a situation like this:83 >> need_more_variants('http://foo.com/abc?id=32')84 True85 >> append('http://foo.com/abc?id=32')86 True87 >> need_more_variants('http://foo.com/abc?id=32')88 False89 """90 HASH_IGNORE_HEADERS = ('referer',)91 TAG = '[variant_db]'92 MAX_IN_MEMORY = 5093 def __init__(self):94 self._variants = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,95 table_prefix='variant_db')96 self._variants_eq = ScalableBloomFilter()97 self._variants_form = CachedDiskDict(max_in_memory=self.MAX_IN_MEMORY,98 table_prefix='variant_db_form')99 self.params_max_variants = cf.cf.get('params_max_variants')100 self.path_max_variants = cf.cf.get('path_max_variants')101 self.max_equal_form_variants = cf.cf.get('max_equal_form_variants')...

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.

LambdaTest Learning Hubs:

YouTube

You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.

Run avocado automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation test minutes FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

NotHelpful