How to use match_fs method in autotest

Best Python code snippet using autotest_python

music_score_parser - 副本.py

Source:music_score_parser - 副本.py Github

copy

Full Screen

"""Tokenizer and combinator parser for the small "rex" grammar language.

Music score grammar sketch kept from the original header (translated):

score = measure*
measure = beat+ '|'
beat = linked_notes+
linked_notes = pitch[duration]
pitch = [accidental]note[octave_shift]
duration = duration_multiplier_mark+
accidental = '#' | 'b' | 'X' | 'bb' | '&'
note = [1-7-]
octave_shift = octave_up_shift | octave_down_shift
octave_up_shift = octave_up+
octave_up = '^'
octave_down_shift = octave_down+
octave_down = 'v'
"""
import ast
import re

# Raw string on purpose: the grammar text itself contains backslash escapes
# (\b, \n, \', \\) that must reach the tokenizer unchanged.  As a normal
# string literal, "\b" became a backspace and "\n" split a rule in two.
rex_language = r'''
rex_language = line*
line = [idDefine] [comment_str] [NEWLINE]
idDefine = id '=' rex
basic_rex = plain_str | rex_str | id | group | multi_optional
multi = basic_rex | multi_bound | multi_star | multi_plus
seq = multi+
choices = seq ('|' seq)*
rex = choices
multi_bound = basic_rex '{' num ',' [num] '}'
multi_star = basic_rex '*'
multi_plus = basic_rex '+'
multi_optional = '[' rex ']'
group = '(' rex ')'
str = rrex"'([^'\\]|\\.)*'" | rrex'"([^"\\]|\\.)*"'
plain_str = rrex'[r](?=[\'"])' str
rex_str = rrex'[r]rex(?=[\'"])' str
comment_str = rrex'#.*'
id = rrex'\b(?=[^0-9])\w+(?=[^\'"])'
num = rrex'\b[1-9]\d*(?=[^\'"])'
NEWLINE = '\n'
BEGIN = rrex'^'
END = rrex'$'
# _NOSPACES_ predefined
'''


class RexParser:
    """Parse rex grammar text line by line into MatchResult trees.

    ``self.match_fs`` maps every grammar rule name to the MatchPattern
    that recognises it over a list of tokens produced by RexTokener.
    """

    def __init__(self):
        def token(kind):
            return MatchForRexParserToken(kind)

        def special(char):
            # 'spiecial_char' (sic) is the token type emitted by RexTokener.
            return MatchForRexParserToken('spiecial_char', data=char)

        id_ = token('id')
        num = token('num')
        plain_str = token('plain_str')
        rex_str = token('rex_str')
        comment_str = token('comment_str')
        assign = special('=')
        left_small_bracket = special('(')
        right_small_bracket = special(')')
        left_big_bracket = special('{')
        comma = special(',')
        right_big_bracket = special('}')
        star = special('*')
        plus = special('+')
        left_mid_bracket = special('[')
        right_mid_bracket = special(']')
        choice_char = special('|')

        # 'rex' is recursive (group / multi_optional contain a rex), so it is
        # created as an empty forwarder first and bound once 'choices' exists.
        rex = MatchEq('rex', None)

        idDefine = MatchSequence('idDefine', [id_, assign, rex])
        line = MatchSequence('line', [MatchOptional('_', idDefine),
                                      MatchOptional('_', comment_str)])
        multi_optional = MatchSequence(
            'multi_optional', [left_mid_bracket, rex, right_mid_bracket])
        group = MatchSequence(
            'group', [left_small_bracket, rex, right_small_bracket])
        basic_rex = MatchChoices(
            'basic_rex', [plain_str, rex_str, id_, group, multi_optional])
        multi_bound = MatchSequence(
            'multi_bound',
            [basic_rex, left_big_bracket, num, comma,
             MatchOptional('_', num), right_big_bracket])
        multi_star = MatchSequence('multi_star', [basic_rex, star])
        multi_plus = MatchSequence('multi_plus', [basic_rex, plus])
        multi = MatchChoices(
            'multi', [basic_rex, multi_bound, multi_star, multi_plus])

        seq = MatchPlus('seq', multi)
        choices = MatchSequence(
            'choices',
            [seq, MatchStar('_', MatchSequence('_', [choice_char, seq]))])
        rex.matchPattern = choices

        # Explicit table instead of harvesting locals(); keys are unchanged
        # except that the accidental 'self' entry is no longer included.
        self.match_fs = {
            'id': id_,
            'num': num,
            'plain_str': plain_str,
            'rex_str': rex_str,
            'comment_str': comment_str,
            'assign': assign,
            'left_small_bracket': left_small_bracket,
            'right_small_bracket': right_small_bracket,
            'left_big_bracket': left_big_bracket,
            'comma': comma,
            'right_big_bracket': right_big_bracket,
            'star': star,
            'plus': plus,
            'left_mid_bracket': left_mid_bracket,
            'right_mid_bracket': right_mid_bracket,
            'choice_char': choice_char,
            'rex': rex,
            'idDefine': idDefine,
            'line': line,
            'multi_optional': multi_optional,
            'group': group,
            'basic_rex': basic_rex,
            'multi_bound': multi_bound,
            'multi_star': multi_star,
            'multi_plus': multi_plus,
            'multi': multi,
            'seq': seq,
            'choices': choices,
        }

    def idDefine2init_assign(self, idDefineResult):
        """Unfinished conversion of an 'idDefine' result into an assignment.

        NOTE(review): this method is work in progress.  It reads a ``name``
        attribute that MatchResult in this file does not define and calls
        ``self.rexResult2MatchPatternConstruct``, which is not defined in
        this file either, so it cannot run as written.  The body is kept
        as found.
        """
        assert idDefineResult.name == 'idDefine'
        assert len(idDefineResult.data) == 3
        id = idDefineResult.data[0]
        assert id.name == 'token'
        id = id.data
        assert type(id) == str
        rexResult = idDefineResult.data[-1]
        assignRightHand = self.rexResult2MatchPatternConstruct(rexResult)

        choicesResult = rexResult.data

    def parser(self, string):
        """Tokenize *string* and match every line against the 'line' rule.

        Returns one MatchResult per input line (blank and comment-only
        lines give an empty match).

        Raises:
            Exception: if a line cannot be matched completely.  Because the
                'line' rule consists of optional parts only, a bare
                ``None`` check can never fire; leftover tokens are the
                actual sign of a syntax error.
            ValueError: propagated from the tokenizer for untokenizable text.
        """
        tokenizer = RexTokener()
        match_line = self.match_fs['line']
        results = []
        for i, line in enumerate(tokenizer.tokenize(string)):
            r = match_line(line, 0, len(line))
            if r is None or r.end != len(line):
                raise Exception('parser fail at {}th line'.format(i))
            results.append(r)
        return results


class RexTokener:
    """Split grammar text into id / num / string / special-char tokens."""

    # Token type tags stored in MatchResult.type.
    id = 'id'
    num = 'num'
    plain_str = 'plain_str'
    rex_str = 'rex_str'
    comment_str = 'comment_str'
    spaces = 'spaces'
    spiecial_char = 'spiecial_char'

    def __init__(self):
        self.special_chars = '#=(){,}*+[]|"\''
        self.plain_str_prefixes = ['r"', "r'", '"', "'"]
        self.rex_str_prefixes = ['rex"', 'rrex"', "rex'", "rrex'"]
        self.str_prefixes = self.plain_str_prefixes + self.rex_str_prefixes

    def isSpecialChar(self, c):
        """Return True if *c* is one of the single-character tokens."""
        return c in self.special_chars

    def getStrPrefixes(self):
        """Return every prefix that can open a string literal."""
        return self.str_prefixes

    def _follows_boundary(self, string, start):
        """Return True if *start* is at the line start or after a separator."""
        if start == 0:
            return True
        prev = string[start - 1]
        return prev.isspace() or self.isSpecialChar(prev)

    def matchID(self, string, start, end):
        """Match an identifier at *start*; return a MatchResult or None.

        An identifier runs up to the next whitespace or special character
        and is rejected when it is directly followed by a quote or when the
        text at *start* is really a string-literal prefix.
        """
        if not self._follows_boundary(string, start) or start == end:
            return None
        first = string[start]
        if first.isspace() or self.isSpecialChar(first) or first.isdigit():
            return None
        if any(string.startswith(pre, start, end)
               for pre in self.getStrPrefixes()):
            return None

        stop = end
        for i in range(start, end):
            c = string[i]
            if c.isspace() or self.isSpecialChar(c):
                if c in '\'"':
                    return None
                stop = i
                break
        assert 0 <= start < stop <= end
        return MatchResult(string, start, stop, type=self.id,
                           data=string[start:stop])

    def matchNum(self, string, start, end):
        """Match a run of digits ending at a separator; data is the int."""
        if not self._follows_boundary(string, start):
            return None
        if start == end or not string[start].isdigit():
            return None

        stop = end
        for i in range(start, end):
            c = string[i]
            if not c.isdigit():
                if c.isspace() or self.isSpecialChar(c):
                    stop = i
                    break
                return None  # digits glued to other text, e.g. "12ab"
        assert 0 <= start < stop <= end
        return MatchResult(string, start, stop, type=self.num,
                           data=int(string[start:stop]))

    def matchStr(self, string, start, end):
        """Match a quoted string literal with an optional r/rex/rrex prefix.

        The result's data is the decoded string value; its type is
        ``rex_str`` for rex/rrex prefixes and ``plain_str`` otherwise.
        """
        if not self._follows_boundary(string, start):
            return None
        for pre in self.getStrPrefixes():
            if string.startswith(pre, start, end):
                break
        else:
            return None
        close_char = pre[-1]
        name = self.rex_str if len(pre) >= 4 else self.plain_str
        is_raw_str = len(pre) in [2, 5]  # 'r"' / "r'" and 'rrex"' / "rrex'"

        str_start = start + len(pre) - 1  # index of the opening quote
        escaping = False
        for i in range(str_start + 1, end):
            c = string[i]
            if escaping:
                # A backslash escapes exactly one character.  The flag used
                # to stay set until the next backslash, which hid the real
                # closing quote after any \' or \" in the literal.
                escaping = False
            elif c == '\\':
                escaping = True
            elif c == close_char:
                break
        else:
            return None
        str_end = i + 1
        assert 0 <= str_start < str_end - 1 < str_end <= end
        literal = string[str_start:str_end]
        if is_raw_str:
            literal = 'r' + literal
        # literal_eval only accepts literals, unlike eval, so grammar text
        # cannot execute code here.
        value = ast.literal_eval(literal)
        return MatchResult(string, start, str_end, type=name, data=value)

    def matchSpecialCharExceptCommentChar(self, string, start, end):
        """Match one special character other than the comment marker '#'."""
        if start == end or string[start] == '#':
            return None
        c = string[start]
        if not self.isSpecialChar(c):
            return None
        return MatchResult(string, start, start + 1,
                           type=self.spiecial_char, data=c)

    def matchComment(self, string, start, end):
        """Match a '#' comment running to the end of the line."""
        if start == end or string[start] != '#':
            return None
        return MatchResult(string, start, end, type=self.comment_str,
                           data=string[start:end])

    def matchSpaces(self, string, start, end):
        """Match a run of whitespace."""
        if start == end or not string[start].isspace():
            return None
        stop = end
        for i in range(start, end):
            if not string[i].isspace():
                stop = i
                break
        return MatchResult(string, start, stop, type=self.spaces, data=None)

    def matchOneToken(self, string, start, end):
        """Return the first token that matches at *start*, or None."""
        matchers = (self.matchID, self.matchNum, self.matchStr,
                    self.matchSpecialCharExceptCommentChar,
                    self.matchComment, self.matchSpaces)
        for f in matchers:
            r = f(string, start, end)
            if r is not None:
                return r
        return None

    def tokenize(self, string):
        """Yield the token list of each line of *string*."""
        for line in string.split('\n'):
            yield self.tokenizeLine(line)

    def tokenizeLine(self, line):
        """Return the tokens of one line; whitespace and comments are dropped.

        Raises:
            ValueError: if some text cannot be tokenized.
        """
        line = line.strip()
        tokens = []
        start = 0
        end = len(line)
        while True:
            r = self.matchOneToken(line, start, end)
            if r is None:
                if start != end:
                    raise ValueError('tokenizeLine fail')
                break
            if r.start == r.end:
                raise Exception('LogicError')
            if r.type not in (self.spaces, self.comment_str):
                tokens.append(r)
            start = r.end
        return tokens


class MatchResult:
    """A matched span ``string[start:end]`` with a type tag and a payload."""

    def __init__(self, string, start, end, type, data):
        assert 0 <= start <= end <= len(string)

        self.string = string
        self.start = start
        self.end = end
        self.type = type
        self.data = data

    def length(self):
        """Return the number of matched items."""
        return self.end - self.start

    def __repr__(self):
        tpl = 'MatchResult(string={string}, start={start}, end={end}, type={type}, data={data})'
        return tpl.format(string=self.string,
                          start=self.start, end=self.end,
                          type=self.type, data=self.data)


def match_str(prefix, string, start, end):
    """Match the literal *prefix* at *start*; return a MatchResult or None."""
    if not string.startswith(prefix, start, end):
        return None
    end = start + len(prefix)
    return MatchResult(string, start, end, type=match_str, data=prefix)


def match_rex(rex, string, start, end):
    """Match regular expression *rex* anchored at *start* within [start, end).

    The previous body referenced an undefined name and reported offsets
    relative to the slice rather than to *string*.
    """
    m = re.match(rex, string[start:end])
    if not m:
        return None
    return MatchResult(string, start, start + m.end(),
                       type=match_rex, data=rex)


class MatchPattern:
    """Base class of all matchers; subclasses implement ``_match``."""

    @staticmethod
    def default_factory():
        return MatchPattern('')

    def __init__(self, name):
        self.name = name

    def match(self, string, start, end):
        """Run the matcher and tag a successful result with this class."""
        r = self._match(string, start, end)
        if r is not None:
            r.type = type(self)
        return r

    def __call__(self, string, start, end):
        return self.match(string, start, end)

    def __repr__(self):
        return '<{type}(name={name})>'.format(type=type(self), name=self.name)


class MatchEq(MatchPattern):
    """Forward to another pattern; used to tie recursive grammar rules."""

    @staticmethod
    def default_factory():
        return MatchEq('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name)
        self.matchPattern = matchPattern

    def _match(self, string, start, end):
        return eq_match(self.matchPattern, string, start, end)


class MatchString(MatchPattern):
    """Match a fixed string."""

    @staticmethod
    def default_factory():
        return MatchString('', '')

    def __init__(self, name, string):
        super().__init__(name)
        self.string = string

    def _match(self, string, start, end):
        return match_str(self.string, string, start, end)


class MatchRex(MatchPattern):
    """Match a regular expression anchored at the current position."""

    @staticmethod
    def default_factory():
        return MatchRex('', '')

    def __init__(self, name, rex):
        super().__init__(name)
        self.rex = rex

    def _match(self, string, start, end):
        return match_rex(self.rex, string, start, end)


class MatchMulti(MatchPattern):
    """Match a pattern between *min* and *max* times (greedy)."""

    @staticmethod
    def default_factory():
        return MatchMulti('', None, 0)

    def __init__(self, name, matchPattern, min, max=float('inf')):
        super().__init__(name)

        assert 0 <= min < min+1 <= max
        assert isinstance(min, int)

        self.matchPattern = matchPattern
        self.min = min
        self.max = max

    def _match(self, string, start, end):
        return repeat_match(self.matchPattern,
                            string, start, end, self.min, self.max)


class MatchSequence(MatchPattern):
    """Match several patterns one after another."""

    @staticmethod
    def default_factory():
        return MatchSequence('', [])

    def __init__(self, name, matchPatternList):
        super().__init__(name)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return sequence_match(self.matchPatternList, string, start, end)


class MatchChoices(MatchPattern):
    """Match the alternative that consumes the most input."""

    @staticmethod
    def default_factory():
        return MatchChoices('', [])

    def __init__(self, name, matchPatternList):
        super().__init__(name)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return choose_match(self.matchPatternList, string, start, end)


class MatchOptional(MatchMulti):
    """Match a pattern zero or one time."""

    @staticmethod
    def default_factory():
        return MatchOptional('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name, matchPattern, min=0, max=1)


class MatchStar(MatchMulti):
    """Match a pattern zero or more times."""

    @staticmethod
    def default_factory():
        return MatchStar('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name, matchPattern, min=0)


class MatchPlus(MatchMulti):
    """Match a pattern one or more times."""

    @staticmethod
    def default_factory():
        return MatchPlus('', None)

    def __init__(self, name, matchPattern):
        super().__init__(name, matchPattern, min=1)


class MatchForRexParserToken(MatchPattern):
    """Match one RexTokener token by type and, optionally, by data."""

    @staticmethod
    def default_factory():
        return MatchForRexParserToken('')

    def __init__(self, type, data=None):
        super().__init__('token')
        self.type = type
        self.data = data

    def _match(self, tokens, start, end):
        if start == end:
            return None
        r = tokens[start]
        if r.type != self.type:
            return None
        if self.data is not None and self.data != r.data:
            return None
        return MatchResult(tokens, start, start+1, type=None, data=r)


def match(match_f, string, start, end):
    """Apply *match_f*, which must be a MatchPattern instance."""
    assert isinstance(match_f, MatchPattern), (type(match_f), match_f)
    return match_f(string, start, end)


def eq_match(match_f, string, start, end):
    """Wrap the result of *match_f* in a new MatchResult over the same span."""
    r = match(match_f, string, start, end)
    if r is None:
        return None
    return MatchResult(r.string, r.start, r.end, type=eq_match, data=r)


def repeat_match(match_f, string, start, end, min, max=float('inf')):
    """Match *match_f* at least *min* and at most *max* times.

    Returns None when fewer than *min* repetitions match; otherwise a
    MatchResult whose data is the list of repetition results.

    Raises:
        MemoryError: if an unbounded repetition matches the empty string,
            which would otherwise loop forever.
    """
    assert min < max

    old_start = start
    ls = []
    for _ in range(min):
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        ls.append(r)
    # Strictly below max: '<=' allowed max + 1 repetitions, so an optional
    # pattern could consume two occurrences.
    while len(ls) < max:
        r = match(match_f, string, start, end)
        if r is None:
            break
        if max == float('inf') and r.start == r.end:
            raise MemoryError('match empty string infinit times')

        start = r.end
        ls.append(r)

    return MatchResult(string, old_start, start, type=repeat_match, data=ls)


def sequence_match(match_fs, string, start, end):
    """Match every pattern of *match_fs* in order; None if any one fails."""
    assert len(match_fs)
    old_start = start
    ls = []
    for match_f in match_fs:
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        ls.append(r)

    return MatchResult(string, old_start, start, type=sequence_match, data=ls)


def choose_match(match_fs, string, start, end):
    """Try every alternative and keep the longest match (first one on ties)."""
    assert len(match_fs)

    ls = []
    for match_f in match_fs:
        r = match(match_f, string, start, end)
        if r is not None:
            ls.append(r)

    if not ls:
        return None
    r = max(ls, key=lambda r: r.length())

    return MatchResult(r.string, r.start, r.end, type=choose_match, data=r)


# Disabled helpers, kept as found in the original file.
'''
def star_match(match_f, string, start, end):
    return repeat_match(match_f, string, start, end, min=0)
def plus_match(match_f, string, start, end):
    return repeat_match(match_f, string, start, end, min=1)
def optional_match(match_f, string, start, end):
    return repeat_match(match_f, string, start, end, min=0, max=1)
def choose_str(strs, string, start, end):
    assert len(strs)
    assert strs == sorted(strs)
    for prefix in reversed(strs):
        r = match_str(prefix, string, start, end)
        if r != None:
            break
    else:
        return None
    return MatchResult(r.string, r.start, r.end, type=choose_str, data=r)
'''

if __name__ == '__main__':
    # Guarded so that importing the module does not run the parser.
    p = RexParser()
    r = p.parser(rex_language)
    # ... (the excerpt is truncated here; the rest of the file is not shown)

Full Screen

Full Screen

article_cleaner.py

Source:article_cleaner.py Github

copy

Full Screen

"""Clean article text and extract the sentences that mention a variation."""
import re
import time

import numpy as np
import pandas as pd

# One-letter -> three-letter amino acid codes.
amino_acid_dict = {'C': 'CYS', 'D': 'ASP', 'S': 'SER', 'Q': 'GLN', 'K': 'LYS',
                   'I': 'ILE', 'P': 'PRO', 'T': 'THR', 'F': 'PHE', 'N': 'ASN',
                   'G': 'GLY', 'H': 'HIS', 'L': 'LEU', 'R': 'ARG', 'W': 'TRP',
                   'A': 'ALA', 'V': 'VAL', 'E': 'GLU', 'Y': 'TYR', 'M': 'MET'}

# Patterns are compiled once at import time; they used to be rebuilt on every
# call, and clean_text() runs many times per article.
_AA_POS_AA_RE = re.compile(r"([a-z]{1,})(\d+)([a-z]{1,})")
# NOTE(review): this requires a literal "?" right before " fusion".  The "?"
# characters look like mis-encoded punctuation from the data set; the pattern
# is kept byte-for-byte until that is confirmed.
_FUSION_RE = re.compile(r"(\w+)\s?(\?|-)\s?(\w+)\? fusion")
_DUP_RE = re.compile(r"([a-z]{1,})(\d+)dup")

# (pattern, replacement) pairs applied in order by clean_text().
_NOISE_SUBSTITUTIONS = (
    (re.compile(r"[\.]{2,}"), "."),   # "..." -> "."
    (re.compile(r"\bfig[s]?\."), ""),  # "fig." / "figs."
    (re.compile(r"\d+\.\d+"), ""),     # decimal numbers
    (re.compile(r"et al\."), ""),
    (re.compile(r"i\.e\."), ""),
    (re.compile(r"\binc\."), ""),
    (re.compile(r"\b[pc]\."), ""),     # "p." / "c." mutation prefixes
)


def amino_three(amino):
    """Return three letter amino acid from one letter"""
    return amino_acid_dict[amino]


def decompose_variation(variation):
    """Split a point variation such as "v600e" into three-letter parts.

    Returns ``[amino1, position, amino2]`` (lower case, e.g.
    ``['val', '600', 'glu']``), or ``False`` when *variation* is not of the
    form <one letter><digits><one letter> or uses a letter that is not a
    standard amino acid code (this used to raise ``KeyError``).
    """
    found = _AA_POS_AA_RE.search(variation)
    if not found:
        return False
    aa1, position, aa2 = found.groups()
    if len(aa1) + len(aa2) != 2:
        return False
    amino1 = amino_acid_dict.get(aa1.upper())
    amino2 = amino_acid_dict.get(aa2.upper())
    if amino1 is None or amino2 is None:
        return False
    return [amino1.lower(), position, amino2.lower()]


def decompose_fusion(variation):
    """Return the regex "(gene1|gene2)" for a fusion variation, else False."""
    found = _FUSION_RE.search(variation)
    if not found:
        return False
    return "(" + found.group(1) + "|" + found.group(3) + ")"


def decompose_dup(variation):
    """Return "<letters><position>" of a duplication such as "a123dup"."""
    found = _DUP_RE.search(variation)
    if not found:
        return False
    return found.group(1) + found.group(2)


def clean_text(article):
    """Lower-case *article* and drop dots that do not end a sentence.

    Removes "fig.", "et al.", "i.e.", "inc.", decimal numbers and the
    "p." / "c." mutation prefixes, and collapses runs of dots, so that a dot
    can be used as the sentence delimiter by find_match().

    The abbreviations are matched at a word boundary only.  Without it the
    end of ordinary words was eaten as well ("step." became "ste"), which
    glued neighbouring sentences together.
    """
    clean_article = article.lower()
    for pattern, replacement in _NOISE_SUBSTITUTIONS:
        clean_article = pattern.sub(replacement, clean_article)
    return clean_article


def join_tuple_string(strings_tuple):
    """Join the items of *strings_tuple* with single spaces."""
    return ' '.join(strings_tuple)


def find_match(text, word):
    """Return every sentence matching *word* plus its neighbouring sentences.

    *word* is a regular expression fragment (it is lower-cased before use);
    escape it with ``re.escape`` when it is literal text.  For each hit the
    previous sentence, the matching sentence and the next sentence are
    joined with spaces; all hits are concatenated.  Returns "" when nothing
    matches.

    Capturing groups inside *word* are not part of the result (they used to
    be spliced into the returned text).
    """
    clean = clean_text(text)
    sentence = r"[^.]*\."
    match_exp = re.compile(
        r"(?P<before>{s})?(?P<target>[^.]*(?:{w})[^.]*\.)(?P<after>{s})?"
        .format(s=sentence, w=word.lower()))
    return "".join(
        join_tuple_string((m.group("before") or "",
                           m.group("target"),
                           m.group("after") or ""))
        for m in match_exp.finditer(clean))


def _candidate_patterns(variation, gene):
    """Return the ordered (regex, quality score) attempts for a variation.

    A lower score means a more specific match.  Literal text taken from the
    data (variation, gene) is escaped so that characters such as "*" or "+"
    are not interpreted as regex operators.
    """
    literal = re.escape(variation)
    attempts = []

    if "hypermethylation" in variation:
        attempts.append(("methylat", 2))
    if "casp" in variation:
        attempts.append(("casp", 1))
    if "splice" in variation:
        attempts.append(("splice", 2))
    if "fs" in variation:
        attempts.append(("frameshift", 2))
    if "ampli" in variation:
        # "increas" used to be misspelled "increse" and could never match.
        attempts.append(("(amplif|increas)", 3))
    if "dup" in variation:
        mut_pos = decompose_dup(variation)
        if mut_pos:
            attempts.append((mut_pos, 2))
        attempts.append(("dup", 3))

    if "*" in variation:
        # Stop-codon notation, e.g. "w802*".
        attempts.append((literal, 1))
        if "fs" in variation:
            attempts.append((r"fs\*", 3))
        attempts.append(("(stop|nonsense)", 2))
    else:
        # Plain match on the variation itself.
        attempts.append((literal, 1))

    if "del" in variation or "ins" in variation:
        attempts.append(("(deletion|insertion|delet|insert)", 2))
        attempts.append((r"(del|ins)(\w|\s){0,}(del|ins)", 3))
    if "trunc" in variation:
        attempts.append(("trunc", 2))
        attempts.append(("(shorte|delet)", 4))
    if "fusion" in variation:
        fusion_gene = decompose_fusion(variation)
        if fusion_gene:
            attempts.append((fusion_gene, 2))
        attempts.append(("fusion", 4))

    aa_pos_aa = decompose_variation(variation)
    if aa_pos_aa:
        amino1, position, amino2 = aa_pos_aa
        attempts.extend([
            (re.escape(variation[:-1]), 2),     # y371
            (amino1 + position + amino2, 1),    # tyr371ser
            (amino1 + position, 2),             # tyr371
            (position, 4),                      # 371
            ("substitu", 3),
            (position[:-1] + "[0-9]", 5),       # 370 - 379
        ])

    attempts.append((re.escape(gene), 5))
    return attempts


def extract_match(line):
    """Return ``(text, quality score)`` for one row of the merged data.

    *line* needs the keys "Text", "Variation" (already lower case) and
    "Gene".  Texts shorter than 10000 characters are returned unchanged with
    score 6.  Otherwise a series of increasingly vague patterns is tried and
    the sentences found by the first successful one are returned with its
    score; if nothing matches the whole text is returned with score 7.
    """
    text = line["Text"]
    variation = line["Variation"]
    gene = line["Gene"].lower()

    if len(text) < 10000:
        return text, 6

    # Hand-picked special cases; they return even when nothing is found.
    if variation == "r1627":
        return find_match(text, "162[0-9]"), 4
    if variation == "c1385":
        return find_match(text, "p300"), 4

    for pattern, score in _candidate_patterns(variation, gene):
        found = find_match(text, pattern)
        if found:
            return found, score

    return text, 7


def prepare_datas(file_text, file_variant, file_out, is_training):
    """Merge text and variant files, extract matches and write *file_out*.

    *file_text* is read with "||" as separator, *file_variant* as a CSV file
    with an "ID" column.  The output is "|||"-separated; the "Class" column
    is written only when *is_training* is true.
    """
    print("____________________________Cleaning Datas__________________________")
    print("____________________________________________________________________")
    start_time = time.perf_counter()

    text = pd.read_csv(file_text, sep=r'\|\|', engine='python')
    text.index.name = "ID"
    text.columns = ["Text"]
    variant = pd.read_csv(file_variant)
    variant.set_index("ID", inplace=True)

    concatenate_data = pd.merge(variant, text, on="ID").dropna()
    concatenate_data["Text"] = concatenate_data["Text"].apply(clean_text)
    concatenate_data["Variation"] = concatenate_data["Variation"].str.lower()

    clean_match_data = concatenate_data.apply(extract_match, axis=1)
    clean_match = pd.DataFrame(list(clean_match_data),
                               columns=["Text", "Score"],
                               index=clean_match_data.index)
    clean_match.index.name = "ID"

    # Both frames have a "Text" column; after the merge the extracted text
    # is "Text_y".
    new_data = pd.merge(concatenate_data, clean_match, on="ID")
    if is_training:
        final_data = new_data.loc[:, ["Gene", "Variation", "Class", "Text_y", "Score"]]
        final_data.columns = ["Gene", "Variation", "Class", "Text", "Score"]
        dtf = pd.merge(pd.DataFrame(final_data.index), final_data, on="ID")
        np.savetxt(file_out, dtf, fmt="%d|||%s|||%s|||%d|||%s|||%d",
                   header="|||".join(dtf.columns), comments='')
    else:
        final_data = new_data.loc[:, ["Gene", "Variation", "Text_y", "Score"]]
        final_data.columns = ["Gene", "Variation", "Text", "Score"]
        dtf = pd.merge(pd.DataFrame(final_data.index), final_data, on="ID")
        np.savetxt(file_out, dtf, fmt="%d|||%s|||%s|||%s|||%d",
                   header="|||".join(dtf.columns), comments='')

    stop_time = time.perf_counter()
    print("____________________________________________________________________")
    print("Cleaning datas finished in {} seconds".format(stop_time-start_time))


def main(is_training):
    """Run prepare_datas() on the training or the test files."""
    if is_training:
        file_text = "datas/training_text"
        file_variant = "datas/training_variants"
        file_out = "datas/training_clean"
    else:
        file_text = "datas/test_text"
        file_variant = "datas/test_variants"
        file_out = "datas/test_clean"
    prepare_datas(file_text, file_variant, file_out, is_training)


if __name__ == "__main__":
    ...  # the entry-point body is elided in this excerpt

Full Screen

Full Screen

MatchPattern.py

Source:MatchPattern.py Github

copy

Full Screen

"""Pattern combinators that produce MatchResult trees."""
import re
import abc
from abc import abstractmethod, ABCMeta
from MatchResult import MatchResult

__all__ = ['MatchChoices', 'MatchEq', 'MatchMulti', 'MatchOptional',
           'MatchPattern', 'MatchPlus', 'MatchResult', 'MatchRex',
           'MatchSequence', 'MatchStar', 'MatchString']


def match_str(prefix, string, start, end):
    """Match the literal *prefix* at *start*; return a MatchResult or None."""
    if not string.startswith(prefix, start, end):
        return None
    end = start + len(prefix)
    return MatchResult(string, start, end, type=match_str, data=prefix, children=None)


def match_rex(rex, string, start, end):
    """Match regular expression *rex* anchored at *start* within [start, end).

    re.match gives the same result as the previous ``re.search`` followed by
    a ``start() == 0`` check, without scanning the rest of the input when
    there is no match at the current position.
    """
    m = re.match(rex, string[start:end])
    if not m:
        return None
    end = start + m.end()
    return MatchResult(string, start, end, type=match_rex, data=rex, children=None)


# Default "follow" pattern: whitespace other than newlines after a match.
skip_follow_chars_by_rex = r'((?=[^\n])\s)*'


class MatchPattern(metaclass=ABCMeta):
    """Abstract matcher.

    Subclasses implement ``_match``; ``match`` then tags the result with the
    matcher's class and name and records the text matched by *follow*
    directly after the match.
    """

    @staticmethod
    @abstractmethod
    def default_factory():
        raise NotImplementedError('default_factory @MatchPattern abstractmethod')

    @abstractmethod
    def _match(self, string, start, end):
        raise NotImplementedError('_match @MatchPattern abstractmethod')

    def __init__(self, *, name='', follow=skip_follow_chars_by_rex):
        self.name = name
        self.follow = follow

    def match(self, string, start, end):
        """Return the tagged MatchResult of this pattern, or None."""
        r = self._match(string, start, end)
        if r is not None:
            r.type = type(self)
            assert not r.name
            r.set_name(self.name)

            if self.follow:
                # Anchored: only text directly after the match may follow.
                m = re.match(self.follow, r.string[r.end:end])
                if m:
                    assert not r.follow
                    r.append_follow(r.string[r.end:r.end + m.end()])
        return r

    def __call__(self, string, start, end):
        return self.match(string, start, end)

    def __repr__(self):
        return '<{type}(name={name})>'.format(type=type(self), name=self.name)


class MatchEq(MatchPattern):
    """Forward to another pattern (its result becomes the single child)."""

    @staticmethod
    def default_factory():
        return MatchEq(None)

    def __init__(self, matchPattern, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.matchPattern = matchPattern

    def _match(self, string, start, end):
        return eq_match(self.matchPattern, string, start, end)


class MatchString(MatchPattern):
    """Match a fixed string."""

    @staticmethod
    def default_factory():
        return MatchString('')

    def __init__(self, string, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.string = string

    def _match(self, string, start, end):
        return match_str(self.string, string, start, end)


class MatchRex(MatchPattern):
    """Match a regular expression anchored at the current position."""

    @staticmethod
    def default_factory():
        return MatchRex('')

    def __init__(self, rex, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.rex = rex

    def _match(self, string, start, end):
        return match_rex(self.rex, string, start, end)


class MatchMulti(MatchPattern):
    """Match a pattern between *min* and *max* times (greedy)."""

    @staticmethod
    def default_factory():
        return MatchMulti(None, 0)

    def __init__(self, matchPattern, min, max=float('inf'), *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)

        assert 0 <= min < min+1 <= max
        assert isinstance(min, int)

        self.matchPattern = matchPattern
        self.min = min
        self.max = max

    def _match(self, string, start, end):
        return repeat_match(self.matchPattern,
                            string, start, end, self.min, self.max)


class MatchSequence(MatchPattern):
    """Match several patterns one after another."""

    @staticmethod
    def default_factory():
        return MatchSequence([])

    def __init__(self, matchPatternList, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return sequence_match(self.matchPatternList, string, start, end)


class MatchChoices(MatchPattern):
    """Match the alternative that consumes the most input."""

    @staticmethod
    def default_factory():
        return MatchChoices([])

    def __init__(self, matchPatternList, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(name=name, follow=follow)
        self.matchPatternList = matchPatternList

    def _match(self, string, start, end):
        return choose_match(self.matchPatternList, string, start, end)


class MatchOptional(MatchMulti):
    """Match a pattern zero or one time."""

    @staticmethod
    def default_factory():
        return MatchOptional([])

    def __init__(self, matchPattern, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(matchPattern, min=0, max=1, name=name, follow=follow)


class MatchStar(MatchMulti):
    """Match a pattern zero or more times."""

    @staticmethod
    def default_factory():
        return MatchStar([])

    def __init__(self, matchPattern, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(matchPattern, min=0, name=name, follow=follow)


class MatchPlus(MatchMulti):
    """Match a pattern one or more times."""

    @staticmethod
    def default_factory():
        return MatchPlus([])

    def __init__(self, matchPattern, *, name='', follow=skip_follow_chars_by_rex):
        super().__init__(matchPattern, min=1, name=name, follow=follow)


def match(match_f, string, start, end):
    """Apply *match_f*, which must be a MatchPattern instance."""
    assert isinstance(match_f, MatchPattern), (type(match_f), match_f)
    return match_f(string, start, end)


def eq_match(match_f, string, start, end):
    """Wrap the result of *match_f* as the only child of a new result."""
    r = match(match_f, string, start, end)
    if r is None:
        return None
    rr = MatchResult(string, r.start, r.org_end, type=eq_match, data=None, children=[r])
    rr.append_follow(r.follow)
    return rr


def repeat_match(match_f, string, start, end, min, max=float('inf')):
    """Match *match_f* at least *min* and at most *max* times.

    Returns None when fewer than *min* repetitions match; otherwise a
    MatchResult whose children are the repetition results.

    Raises:
        MemoryError: if an unbounded repetition matches the empty string,
            which would otherwise loop forever.
    """
    assert min < max

    old_start = start
    ls = []
    for _ in range(min):
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        ls.append(r)
    # Strictly below max: '<=' allowed max + 1 repetitions, so an optional
    # pattern could consume two occurrences.
    while len(ls) < max:
        r = match(match_f, string, start, end)
        if r is None:
            break
        if max == float('inf') and r.start == r.end:
            # The diagnostics live in the message now; the former debug
            # prints read match_f.max, which only MatchMulti instances have.
            raise MemoryError(
                'match empty string infinit times: {!r} at offset {}'
                .format(match_f, start))

        start = r.end
        ls.append(r)

    return MatchResult(string, old_start, start, type=repeat_match, data=None, children=ls)


def sequence_match(match_fs, string, start, end):
    """Match every pattern of *match_fs* in order; None if any one fails."""
    assert len(match_fs)
    old_start = start
    ls = []
    for match_f in match_fs:
        r = match(match_f, string, start, end)
        if r is None:
            return None
        start = r.end
        ls.append(r)

    return MatchResult(string, old_start, start, type=sequence_match, data=None, children=ls)


def choose_match(match_fs, string, start, end):
    """Try every alternative and keep the longest match (first one on ties).

    The result's data is the index of the chosen alternative; its children
    list has one slot per alternative with only the chosen slot filled.
    A warning is printed when more than one alternative matches.
    """
    assert len(match_fs)

    ls = []
    for i, match_f in enumerate(match_fs):
        r = match(match_f, string, start, end)
        if r is not None:
            ls.append((i, r))

    if not ls:
        return None
    if len(ls) > 1:
        print('Warning: choose lengthest one from matchs')
        for i, _ in ls:
            print('\t', match_fs[i].name)

    i, r = max(ls, key=lambda ir: ir[-1].length())

    cs = [None]*len(match_fs)
    cs[i] = r
    return MatchResult(r.string, r.start, r.end, type=choose_match, data=i, children=cs)

# ... (the excerpt is truncated here; the rest of the file is not shown)

Full Screen

Full Screen

Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g. Selenium, Cypress, and TestNG.

LambdaTest Learning Hubs:

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run autotest automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.

Try LambdaTest Now !!

Get 100 minutes of automation testing FREE!!

Next-Gen App & Browser Testing Cloud

Was this article helpful?

Helpful

Not Helpful