Best Python code snippet using behave
rules.py
Source:rules.py
1"""2Spider rules.Scheduler will provide crawling tasks according to the rules and3spiders will parse response content according to the rules.4"""5from config.settings import (6 SPIDER_COMMON_TASK, SPIDER_AJAX_TASK,7 SPIDER_GFW_TASK, SPIDER_AJAX_GFW_TASK,8 INIT_HTTP_QUEUE, VALIDATED_HTTP_QUEUE,9 VALIDATED_HTTPS_QUEUE, TEMP_HTTP_QUEUE,10 TEMP_HTTPS_QUEUE, TTL_HTTP_QUEUE,11 TTL_HTTPS_QUEUE, SPEED_HTTPS_QUEUE,12 SPEED_HTTP_QUEUE, TEMP_WEIBO_QUEUE,13 VALIDATED_WEIBO_QUEUE, TTL_WEIBO_QUEUE,14 SPEED_WEIBO_QUEUE, TEMP_ZHIHU_QUEUE,15 VALIDATED_ZHIHU_QUEUE, TTL_ZHIHU_QUEUE,16 SPEED_ZHIHU_QUEUE)17__all__ = ['CRAWLER_TASKS', 'VALIDATOR_TASKS', 'CRAWLER_TASK_MAPS',18 'TEMP_TASK_MAPS', 'SCORE_MAPS', 'TTL_MAPS',19 'SPEED_MAPS']20CRAWLER_TASKS = [21 {22 'name': 'mogumiao.com',23 'resource': ['http://www.mogumiao.com/proxy/free/listFreeIp',24 'http://www.mogumiao.com/proxy/api/freeIp?count=15'],25 'task_queue': SPIDER_COMMON_TASK,26 'parse_type': 'json',27 'parse_rule': {28 'detail_rule': ['msg'],29 'ip_key': 'ip',30 'port_key': 'port',31 },32 'interval': 5,33 'enable': 1,34 },35 {36 # now we can't get proxies from it,but it required by ip18137 'name': 'xdaili.cn',38 'resource': ['http://www.xdaili.cn:80/ipagent/freeip/getFreeIps?page=1&rows=10'],39 'task_queue': SPIDER_COMMON_TASK,40 'parse_type': 'json',41 'parse_rule': {42 'detail_rule': ['RESULT'],43 'ip_key': 'ip',44 'port_key': 'port',45 },46 'interval': 10,47 'enable': 0,48 },49 {50 'name': 'xicidaili.com',51 'resource': ['http://www.xicidaili.com/nn/%s' % i for i in range(1, 6)] +52 ['http://www.xicidaili.com/wn/%s' % i for i in range(1, 6)] +53 ['http://www.xicidaili.com/wt/%s' % i for i in range(1, 6)],54 'task_queue': SPIDER_COMMON_TASK,55 'parse_type': 'common',56 'parse_rule': {57 'pre_extract_method': 'xpath',58 'pre_extract': '//tr',59 'infos_pos': 1,60 'infos_end': None,61 'detail_rule': 'td::text',62 'ip_pos': 0,63 'port_pos': 1,64 'extract_protocol': True,65 'split_detail': False,66 'protocols': None67 },68 
'interval': 60,69 'enable': 170 },71 {72 'name': 'kuaidaili.com',73 'resource': ['https://www.kuaidaili.com/free/inha/%s' % i for i in range(1, 6)] +74 ['https://www.kuaidaili.com/proxylist/%s' % i for i in range(1, 11)],75 'task_queue': SPIDER_COMMON_TASK,76 'parse_type': 'common',77 'parse_rule': {78 'pre_extract_method': 'xpath',79 'pre_extract': '//tr',80 'infos_pos': 4,81 'infos_end': None,82 'detail_rule': 'td::text',83 'ip_pos': 0,84 'port_pos': 1,85 'extract_protocol': True,86 'split_detail': False,87 'protocols': None88 },89 'interval': 60,90 'enable': 191 },92 {93 'name': 'kxdaili.com',94 'resource': [95 'http://www.kxdaili.com/dailiip/%s/%s.html#ip' % (i, j) for i in range(1, 3) for j in range(1, 11)96 ],97 'task_queue': SPIDER_COMMON_TASK,98 'parse_type': 'common',99 'parse_rule': {100 'pre_extract_method': 'xpath',101 'pre_extract': '//tr',102 'infos_pos': 1,103 'infos_end': None,104 'detail_rule': 'td::text',105 'ip_pos': 0,106 'port_pos': 1,107 'extract_protocol': True,108 'split_detail': False,109 'protocols': None110 },111 'interval': 60,112 'enable': 1113 },114 {115 'name': 'mrhinkydink.com',116 'resource': ['http://www.mrhinkydink.com/proxies.htm'],117 'task_queue': SPIDER_COMMON_TASK,118 'parse_type': 'common',119 'parse_rule': {120 'pre_extract_method': 'css',121 'pre_extract': '.text',122 'infos_pos': 1,123 'infos_end': None,124 'detail_rule': 'td::text',125 'ip_pos': 0,126 'port_pos': 1,127 'extract_protocol': True,128 'split_detail': False,129 'protocols': None130 },131 'interval': 2 * 60,132 'enable': 1,133 },134 {135 'name': 'nianshao.me',136 'resource': ['http://www.nianshao.me/?stype=1&page=%s' % i for i in range(1, 11)] +137 ['http://www.nianshao.me/?stype=2&page=%s' % i for i in range(1, 11)] +138 ['http://www.nianshao.me/?stype=5&page=%s' % i for i in range(1, 11)],139 'task_queue': SPIDER_COMMON_TASK,140 'parse_type': 'common',141 'parse_rule': {142 'pre_extract_method': 'xpath',143 'pre_extract': '//tr',144 'infos_pos': 1,145 
'infos_end': None,146 'detail_rule': 'td::text',147 'ip_pos': 0,148 'port_pos': 1,149 'extract_protocol': True,150 'split_detail': False,151 'protocols': None152 },153 'interval': 60,154 'enable': 1 # it seems the website is down155 },156 {157 'name': '66ip.cn',158 'resource': ['http://www.66ip.cn/%s.html' % i for i in range(1, 3)] +159 ['http://www.66ip.cn/areaindex_%s/%s.html' % (i, j)160 for i in range(1, 35) for j in range(1, 3)],161 'task_queue': SPIDER_COMMON_TASK,162 'parse_type': 'common',163 'parse_rule': {164 'pre_extract_method': 'xpath',165 'pre_extract': '//tr',166 'infos_pos': 4,167 'infos_end': None,168 'detail_rule': 'td::text',169 'ip_pos': 0,170 'port_pos': 1,171 'extract_protocol': True,172 'split_detail': False,173 'protocols': None174 },175 'interval': 2 * 60,176 'enable': 1177 },178 {179 'name': 'baizhongsou.com',180 'resource': ['http://ip.baizhongsou.com/'],181 'task_queue': SPIDER_COMMON_TASK,182 'parse_type': 'common',183 'parse_rule': {184 'pre_extract_method': 'xpath',185 'pre_extract': '//tr',186 'infos_pos': 1,187 'infos_end': None,188 'detail_rule': 'td::text',189 'ip_pos': 0,190 'port_pos': 1,191 'extract_protocol': True,192 'split_detail': True,193 'protocols': None194 },195 'interval': 30,196 'enable': 1197 },198 {199 'name': 'data5u.com',200 'resource': [201 'http://www.data5u.com/free/index.shtml',202 'http://www.data5u.com/free/gngn/index.shtml',203 'http://www.data5u.com/free/gwgn/index.shtml'204 ],205 'task_queue': SPIDER_COMMON_TASK,206 'parse_type': 'common',207 'parse_rule': {208 'pre_extract_method': 'xpath',209 'pre_extract': '//ul[contains(@class, "l2")]',210 'infos_pos': 0,211 'infos_end': None,212 'detail_rule': 'span li::text',213 'ip_pos': 0,214 'port_pos': 1,215 'extract_protocol': True,216 'split_detail': False,217 'protocols': None218 },219 'interval': 10,220 'enable': 1,221 },222 {223 # can not access224 'name': 'httpsdaili.com',225 'resource': ['http://www.httpsdaili.com/?stype=1&page=%s' % i for i in range(1, 
8)],226 'task_queue': SPIDER_COMMON_TASK,227 'parse_type': 'common',228 'parse_rule': {229 'pre_extract_method': 'xpath',230 'pre_extract': '//tr[contains(@class, "odd")]',231 'infos_pos': 0,232 'infos_end': None,233 'detail_rule': 'td::text',234 'ip_pos': 0,235 'port_pos': 1,236 'extract_protocol': True,237 'split_detail': False,238 'protocols': None239 },240 'interval': 3 * 60,241 'enable': 0,242 },243 {244 'name': 'ip181.com',245 'resource': ['http://www.ip181.com/'] +246 ['http://www.ip181.com/daili/%s.html' % i for i in range(1, 20)],247 'task_queue': SPIDER_COMMON_TASK,248 'parse_type': 'common',249 'parse_rule': {250 'pre_extract_method': 'xpath',251 'pre_extract': '//tr',252 'infos_pos': 1,253 'infos_end': None,254 'detail_rule': 'td::text',255 'ip_pos': 0,256 'port_pos': 1,257 'extract_protocol': True,258 'split_detail': False,259 'protocols': None260 },261 'interval': 10,262 'enable': 1,263 },264 {265 'name': 'ip3366.net',266 'resource': ['http://www.ip3366.net/free/?stype=1&page=%s' % i for i in range(1, 3)] +267 ['http://www.ip3366.net/free/?stype=3&page=%s' % i for i in range(1, 3)],268 'task_queue': SPIDER_COMMON_TASK,269 'parse_type': 'common',270 'parse_rule': {271 'pre_extract_method': 'xpath',272 'pre_extract': '//tr',273 'infos_pos': 1,274 'infos_end': None,275 'detail_rule': 'td::text',276 'ip_pos': 0,277 'port_pos': 1,278 'extract_protocol': True,279 'split_detail': False,280 'protocols': None281 },282 'interval': 30,283 'enable': 1284 },285 {286 'name': 'iphai.com',287 'resource': [288 'http://www.iphai.com/free/ng',289 'http://www.iphai.com/free/wg',290 'http://www.iphai.com/free/np',291 'http://www.iphai.com/free/wp',292 'http://www.iphai.com/'293 ],294 'task_queue': SPIDER_COMMON_TASK,295 'parse_type': 'common',296 'parse_rule': {297 'pre_extract_method': 'xpath',298 'pre_extract': '//tr',299 'infos_pos': 1,300 'infos_end': None,301 'detail_rule': 'td::text',302 'ip_pos': 0,303 'port_pos': 1,304 'extract_protocol': True,305 'split_detail': 
False,306 'protocols': None307 },308 'interval': 60,309 'enable': 1,310 },311 {312 'name': 'swei360.com',313 'resource': ['http://www.swei360.com/free/?page=%s' % i for i in range(1, 4)] +314 ['http://www.swei360.com/free/?stype=3&page=%s' % i for i in range(1, 4)],315 'task_queue': SPIDER_COMMON_TASK,316 'parse_type': 'common',317 'parse_rule': {318 'pre_extract_method': 'xpath',319 'pre_extract': '//tr',320 'infos_pos': 1,321 'infos_end': None,322 'detail_rule': 'td::text',323 'ip_pos': 0,324 'port_pos': 1,325 'extract_protocol': True,326 'split_detail': False,327 'protocols': None328 },329 'interval': 30,330 'enable': 1,331 },332 {333 'name': 'yundaili.com',334 'resource': [335 'http://www.yun-daili.com/free.asp?stype=1',336 'http://www.yun-daili.com/free.asp?stype=2',337 'http://www.yun-daili.com/free.asp?stype=3',338 'http://www.yun-daili.com/free.asp?stype=4',339 ],340 'task_queue': SPIDER_COMMON_TASK,341 'parse_type': 'common',342 'parse_rule': {343 'pre_extract_method': 'xpath',344 'pre_extract': '//tr[contains(@class, "odd")]',345 'infos_pos': 0,346 'infos_end': None,347 'detail_rule': 'td::text',348 'ip_pos': 0,349 'port_pos': 1,350 'extract_protocol': True,351 'split_detail': False,352 'protocols': None353 },354 'interval': 6 * 60,355 'enable': 1,356 },357 {358 'name': 'ab57.ru',359 'resource': ['http://ab57.ru/downloads/proxyold.txt'],360 'task_queue': SPIDER_COMMON_TASK,361 'parse_type': 'text',362 'parse_rule': {363 'pre_extract': None,364 'delimiter': '\r\n',365 'redundancy': None,366 'protocols': None367 },368 'interval': 60,369 'enable': 1,370 },371 {372 'name': 'proxylists.net',373 'resource': ['http://www.proxylists.net/http_highanon.txt'],374 'parse_type': 'text',375 'task_queue': SPIDER_COMMON_TASK,376 'parse_rule': {377 'pre_extract': None,378 'delimiter': '\r\n',379 'redundancy': None,380 'protocols': None381 },382 'interval': 60,383 'enable': 1,384 },385 {386 'name': 'my-proxy.com',387 'resource': [388 
'https://www.my-proxy.com/free-elite-proxy.html',389 'https://www.my-proxy.com/free-anonymous-proxy.html',390 'https://www.my-proxy.com/free-socks-4-proxy.html',391 'https://www.my-proxy.com/free-socks-5-proxy.html'392 ],393 'task_queue': SPIDER_COMMON_TASK,394 # if the parse method is specified, set it in the Spider's parser_maps395 'parse_type': 'myproxy',396 'interval': 60,397 'enable': 1,398 },399 {400 'name': 'us-proxy.org',401 'resource': ['https://www.us-proxy.org/'],402 'task_queue': SPIDER_COMMON_TASK,403 'parse_type': 'common',404 'parse_rule': {405 'pre_extract_method': 'xpath',406 'pre_extract': '//tbody//tr',407 'infos_pos': 0,408 'infos_end': None,409 'detail_rule': 'td::text',410 'ip_pos': 0,411 'port_pos': 1,412 'extract_protocol': True,413 'split_detail': False,414 'protocols': None415 },416 'interval': 60,417 'enable': 1,418 },419 {420 'name': 'socks-proxy.net',421 'resource': [422 'https://www.socks-proxy.net/',423 ],424 'task_queue': SPIDER_COMMON_TASK,425 'parse_type': 'common',426 'parse_rule': {427 'pre_extract_method': 'xpath',428 'pre_extract': '//tbody//tr',429 'infos_pos': 0,430 'infos_end': None,431 'detail_rule': 'td::text',432 'ip_pos': 0,433 'port_pos': 1,434 'extract_protocol': True,435 'split_detail': False,436 'protocols': None437 },438 'interval': 60,439 'enable': 1,440 },441 {442 'name': 'sslproxies.org/',443 'resource': ['https://www.sslproxies.org/'],444 'task_queue': SPIDER_COMMON_TASK,445 'parse_type': 'common',446 'parse_rule': {447 'pre_extract_method': 'xpath',448 'pre_extract': '//tbody//tr',449 'infos_pos': 0,450 'infos_end': None,451 'detail_rule': 'td::text',452 'ip_pos': 0,453 'port_pos': 1,454 'extract_protocol': True,455 'split_detail': False,456 'protocols': None457 },458 'interval': 60,459 'enable': 1,460 },461 {462 'name': 'atomintersoft.com',463 'resource': [464 'http://www.atomintersoft.com/high_anonymity_elite_proxy_list',465 'http://www.atomintersoft.com/anonymous_proxy_list',466 ],467 'task_queue': 
SPIDER_COMMON_TASK,468 'parse_type': 'common',469 'parse_rule': {470 'pre_extract_method': 'xpath',471 'pre_extract': '//tr',472 'infos_pos': 1,473 'infos_end': None,474 'detail_rule': 'td::text',475 'ip_pos': 0,476 'port_pos': 1,477 'extract_protocol': True,478 'split_detail': True,479 'protocols': None480 },481 'interval': 60,482 'enable': 1,483 },484 {485 'name': 'rmccurdy.com',486 'resource': [487 'https://www.rmccurdy.com/scripts/proxy/good.txt'488 ],489 'task_queue': SPIDER_COMMON_TASK,490 'parse_type': 'text',491 'parse_rule': {492 'pre_extract': None,493 'delimiter': '\n',494 'redundancy': None,495 'protocols': None496 },497 'interval': 60,498 'enable': 1,499 },500 {501 # there are some problems using crawlspider, so we use basic spider502 'name': 'coderbusy.com',503 'resource': ['https://proxy.coderbusy.com/'] +504 ['https://proxy.coderbusy.com/classical/https-ready.aspx?page=%s' % i for i in range(1, 21)] +505 ['https://proxy.coderbusy.com/classical/post-ready.aspx?page=%s' % i for i in range(1, 21)] +506 ['https://proxy.coderbusy.com/classical/anonymous-type/anonymous.aspx?page=%s'507 % i for i in range(1, 6)] +508 ['https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?page=%s'509 % i for i in range(1, 6)] +510 ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=%s' % i for i in range(1, 21)] +511 ['https://proxy.coderbusy.com/classical/country/us.aspx?page=%s' % i for i in range(1, 11)] +512 ['https://proxy.coderbusy.com/classical/country/id.aspx?page=%s' % i for i in range(1, 6)] +513 ['https://proxy.coderbusy.com/classical/country/ru.aspx?page=%s' % i for i in range(1, 6)],514 'task_queue': SPIDER_AJAX_TASK,515 'parse_type': 'common',516 'parse_rule': {517 'pre_extract_method': 'xpath',518 'pre_extract': '//tr',519 'infos_pos': 1,520 'infos_end': None,521 'detail_rule': 'td::text',522 'ip_pos': 1,523 'port_pos': 2,524 'extract_protocol': False,525 'split_detail': False,526 'protocols': None527 },528 'interval': 2 * 60,529 
'enable': 1,530 },531 {532 'name': 'proxydb.net',533 'resource': ['http://proxydb.net/?offset=%s' % (15 * i) for i in range(20)],534 'task_queue': SPIDER_AJAX_TASK,535 'parse_type': 'common',536 'parse_rule': {537 'detail_rule': 'a::text',538 'split_detail': True,539 },540 'interval': 3 * 60,541 'enable': 1,542 },543 {544 'name': 'cool-proxy.net',545 'resource': ['https://www.cool-proxy.net/proxies/http_proxy_list/country_code:/port:/anonymous:1/page:%s'546 % i for i in range(1, 11)],547 'task_queue': SPIDER_AJAX_TASK,548 'parse_type': 'common',549 'parse_rule': {550 'pre_extract_method': 'xpath',551 'pre_extract': '//tr',552 'infos_pos': 1,553 'infos_end': -1,554 'detail_rule': 'td::text',555 'ip_pos': 0,556 'port_pos': 1,557 'extract_protocol': True,558 'split_detail': False,559 'protocols': None560 },561 'interval': 30,562 'enable': 1,563 },564 {565 'name': 'goubanjia.com',566 'resource': ['http://www.goubanjia.com/'],567 'task_queue': SPIDER_AJAX_TASK,568 'parse_type': 'goubanjia',569 'interval': 10,570 'enable': 1,571 },572 {573 'name': 'cn-proxy.com',574 'resource': [575 'http://cn-proxy.com/',576 'http://cn-proxy.com/archives/218'577 ],578 'task_queue': SPIDER_GFW_TASK,579 'parse_type': 'common',580 'parse_rule': {581 'pre_extract_method': 'xpath',582 'pre_extract': '//tbody//tr',583 'infos_pos': 0,584 'infos_end': None,585 'detail_rule': 'td::text',586 'ip_pos': 0,587 'port_pos': 1,588 'extract_protocol': True,589 'split_detail': False,590 'protocols': None591 },592 'interval': 60,593 'enable': 1,594 },595 {596 'name': 'free-proxy-list.net',597 'resource': [598 'https://free-proxy-list.net/',599 'https://free-proxy-list.net/uk-proxy.html',600 'https://free-proxy-list.net/anonymous-proxy.html',601 ],602 'task_queue': SPIDER_GFW_TASK,603 'parse_type': 'common',604 'parse_rule': {605 'pre_extract_method': 'xpath',606 'pre_extract': '//tbody//tr',607 'infos_pos': 0,608 'infos_end': None,609 'detail_rule': 'td::text',610 'ip_pos': 0,611 'port_pos': 1,612 
'extract_protocol': True,613 'split_detail': False,614 'protocols': None615 },616 'interval': 60,617 'enable': 1,618 },619 {620 'name': 'xroxy',621 'resource': ['http://www.xroxy.com/proxylist.php?port=&type=&ssl=&country=&latency=&reliability=&'622 'sort=reliability&desc=true&pnum=%s#table' % i for i in range(20)],623 'task_queue': SPIDER_GFW_TASK,624 'parse_type': 'xroxy',625 'interval': 60,626 'enable': 1,627 },628 {629 'name': 'proxylistplus',630 'resource': [631 'http://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1',632 'http://list.proxylistplus.com/SSL-List-1'633 ],634 'task_queue': SPIDER_GFW_TASK,635 'parse_type': 'common',636 'parse_rule': {637 'pre_extract_method': 'xpath',638 'pre_extract': '//tr[contains(@class, "cells")]',639 'infos_pos': 1,640 'infos_end': -1,641 'detail_rule': 'td::text',642 'ip_pos': 0,643 'port_pos': 1,644 'extract_protocol': False,645 'split_detail': False,646 'protocols': None647 },648 'interval': 3 * 60,649 'enable': 1,650 },651 {652 'name': 'cnproxy.com',653 'resource': ['http://www.cnproxy.com/proxy%s.html' % i for i in range(1, 11)] +654 ['http://www.cnproxy.com/proxyedu%s.html' % i for i in range(1, 3)],655 'task_queue': SPIDER_AJAX_GFW_TASK,656 'parse_type': 'cnproxy',657 'interval': 60,658 'enable': 1,659 },660 {661 'name': 'free-proxy.cz',662 'resource': ['http://free-proxy.cz/en/proxylist/main/%s' % i for i in range(1, 30)],663 'task_queue': SPIDER_AJAX_GFW_TASK,664 'parse_type': 'free-proxy',665 'interval': 3 * 60,666 'enable': 1,667 },668 {669 'name': 'proxy-list.org',670 'resource': ['https://proxy-list.org/english/index.php?p=%s' % i for i in range(1, 11)],671 'task_queue': SPIDER_AJAX_GFW_TASK,672 'parse_type': 'common',673 'parse_rule': {674 'pre_extract_method': 'css',675 'pre_extract': '.table ul',676 'infos_pos': 1,677 'infos_end': None,678 'detail_rule': 'li::text',679 'ip_pos': 0,680 'port_pos': 1,681 'extract_protocol': True,682 'split_detail': True,683 'protocols': None684 },685 'interval': 60,686 
'enable': 1,687 },688 {689 'name': 'gatherproxy',690 'resource': [691 'http://www.gatherproxy.com/',692 'http://www.gatherproxy.com/proxylist/anonymity/?t=Elite',693 'http://www.gatherproxy.com/proxylist/anonymity/?t=Anonymous',694 'http://www.gatherproxy.com/proxylist/country/?c=China',695 'http://www.gatherproxy.com/proxylist/country/?c=Brazil',696 'http://www.gatherproxy.com/proxylist/country/?c=Indonesia',697 'http://www.gatherproxy.com/proxylist/country/?c=Russia',698 'http://www.gatherproxy.com/proxylist/country/?c=United%20States',699 'http://www.gatherproxy.com/proxylist/country/?c=Thailand',700 'http://www.gatherproxy.com/proxylist/port/8080',701 'http://www.gatherproxy.com/proxylist/port/3128',702 'http://www.gatherproxy.com/proxylist/port/80',703 'http://www.gatherproxy.com/proxylist/port/8118'704 ],705 'task_queue': SPIDER_AJAX_GFW_TASK,706 'parse_type': 'common',707 'parse_rule': {708 'pre_extract_method': 'xpath',709 'pre_extract': '//tr',710 'infos_pos': 1,711 'infos_end': None,712 'detail_rule': 'td::text',713 'ip_pos': 0,714 'port_pos': 1,715 'extract_protocol': True,716 'split_detail': False,717 'protocols': None718 },719 'interval': 60,720 'enable': 1,721 },722]723# crawler will fetch tasks from the following queues724CRAWLER_TASK_MAPS = {725 'common': SPIDER_COMMON_TASK,726 'ajax': SPIDER_AJAX_TASK,727 'gfw': SPIDER_GFW_TASK,728 'ajax_gfw': SPIDER_AJAX_GFW_TASK729}730# validator scheduler will fetch tasks from resource queue and store into task queue731VALIDATOR_TASKS = [732 {733 'name': 'http',734 'task_queue': TEMP_HTTP_QUEUE,735 'resource': VALIDATED_HTTP_QUEUE,736 'interval': 5, # 20 minutes737 'enable': 1,738 },739 {740 'name': 'https',741 'task_queue': TEMP_HTTPS_QUEUE,742 'resource': VALIDATED_HTTPS_QUEUE,743 'interval': 5,744 'enable': 1,745 },746 {747 'name': 'weibo',748 'task_queue': TEMP_WEIBO_QUEUE,749 'resource': VALIDATED_WEIBO_QUEUE,750 'interval': 5,751 'enable': 1,752 },753 {754 'name': 'zhihu',755 'task_queue': 
TEMP_ZHIHU_QUEUE,756 'resource': VALIDATED_ZHIHU_QUEUE,757 'interval': 5,758 'enable': 1,759 },760]761# validators will fetch proxies from the following queues762TEMP_TASK_MAPS = {763 'init': INIT_HTTP_QUEUE,764 'http': TEMP_HTTP_QUEUE,765 'https': TEMP_HTTPS_QUEUE,766 'weibo': TEMP_WEIBO_QUEUE,767 'zhihu': TEMP_ZHIHU_QUEUE768}769# target website that use http protocol770HTTP_TASKS = ['http']771# target website that use https protocol772HTTPS_TASKS = ['https', 'zhihu', 'weibo']773# todo the three maps may be combined in one map774# validator scheduler and clients will fetch proxies from the following queues775SCORE_MAPS = {776 'http': VALIDATED_HTTP_QUEUE,777 'https': VALIDATED_HTTPS_QUEUE,778 'weibo': VALIDATED_WEIBO_QUEUE,779 'zhihu': VALIDATED_ZHIHU_QUEUE780}781# validator scheduler and clients will fetch proxies from the following queues which are verified recently782TTL_MAPS = {783 'http': TTL_HTTP_QUEUE,784 'https': TTL_HTTPS_QUEUE,785 'weibo': TTL_WEIBO_QUEUE,786 'zhihu': TTL_ZHIHU_QUEUE787}788SPEED_MAPS = {789 'http': SPEED_HTTP_QUEUE,790 'https': SPEED_HTTPS_QUEUE,791 'weibo': SPEED_WEIBO_QUEUE,792 'zhihu': SPEED_ZHIHU_QUEUE...
test_parse.py
Source:test_parse.py
...31 cls.rule7 = "A(x[1]),B(x[1]) -> A(x[.]),. @ 1"32 cls.obs0 = "%obs: 'ste5 dimerized' |Ste5(ste5[1]),Ste5(ste5[1])|"33 def test_rule_parse(self):34 mds = [readers.KappaReader.parse_mtype(x) for x in self.mds]35 rule0s = readers.KappaReader.parse_rule(self.rule0, mds)36 assert len(rule0s) == 137 assert not rule0s[0].rev38 rule1s = readers.KappaReader.parse_rule(self.rule1, mds)39 assert len(rule1s[0].lhs) == len(rule0s[0].lhs)40 for i in range(len(rule1s[0].lhs)):41 assert rule1s[0].lhs[i].is_isomorphic(rule0s[0].lhs[i])42 assert rule1s[0].label == 'rule'43 rule2s = readers.KappaReader.parse_rule(self.rule2, mds)44 assert len(rule2s) == 445 for r in rule2s:46 assert not r.rev47 rule3s = readers.KappaReader.parse_rule(self.rule3, mds)48 assert len(rule3s) == 349 assert rule3s[0].label == 'label with space'50 rule4s = readers.KappaReader.parse_rule(self.rule4, mds)51 assert len(rule4s) == 152 assert rule4s[0].rev53 rule5s = readers.KappaReader.parse_rule(self.rule5, mds)54 print rule5s[0]55 assert len(rule5s[0].lhs) == 156 assert len(rule5s[0].lhs[0]) == 157 assert rule5s[0].lhs[0][0] == objects.PlaceHolderMolecule()58 assert rule5s[0].rev59 rule6s = readers.KappaReader.parse_rule(self.rule6, mds)60 print rule6s61 assert len(rule6s[0].rhs) == 162 assert len(rule6s[0].rhs[0]) == 163 assert rule6s[0].rhs[0][0] == objects.PlaceHolderMolecule()64 assert rule6s[0].delmol65 rule7s = readers.KappaReader.parse_rule(self.rule7, mds)66 assert len(rule7s[0].rhs) == 267 print rule7s[0].rhs68 assert isinstance(rule7s[0].rhs[1][0], objects.PlaceHolderMolecule)69 def test_cpattern_parse(self):70 pmdef2 = readers.KappaReader.parse_mtype(self.mdef2)71 pmdef3 = readers.KappaReader.parse_mtype(self.mdef3)72 pmdef4 = readers.KappaReader.parse_mtype(self.mdef4)73 pcp0 = readers.KappaReader.parse_cpatterns(self.cp0, [pmdef2, pmdef3, pmdef4])74 assert len(pcp0) == 275 assert len(pcp0[1].molecule_list) == 276 assert len(pcp0[0].molecule_list) == 177 assert 
pcp0[1].molecule_list[1].sites[0].bond.num == 178 assert not pcp0[0].molecule_list[0].sites79 assert pcp0[1].molecule_list[1].sites[1].state == 's'80 def test_init_parse(self):81 pmdef2 = readers.KappaReader.parse_mtype(self.mdef2)82 assert readers.KappaReader.parse_init(self.init0, [pmdef2])[0].write_as_kappa() == "%init: 10 A(x[.])"83 bdef = objects.MoleculeDef('B', [], {})84 cdef = objects.MoleculeDef('C', [], {})85 assert readers.KappaReader.parse_init(self.init1, [bdef, cdef])[0].write_as_kappa() == "%init: 10 + 'x' B()"86 assert readers.KappaReader.parse_init(self.init1, [bdef, cdef])[1].write_as_kappa() == "%init: 10 + 'x' C()"87 def test_eq_parse(self):88 assert readers.KappaReader.parse_alg_expr(self.expr0).asList() == ['10', '+', 'x']89 assert readers.KappaReader.parse_alg_expr(self.expr1).asList() == \90 ['[log]', '100', '/', '[max]', '10', '100', '-', '[int]', '7.342']91 def test_mdef_parse(self):92 assert readers.KappaReader.parse_mtype(self.mdef0).write_as_kappa() == "%agent: M(x,y,z{a,b})"93 def test_mol_parse(self):94 pmdef0 = readers.KappaReader.parse_mtype(self.mdef0)95 assert readers.KappaReader.parse_molecule(self.mol0, [pmdef0]).write_as_kappa() == "M(x[.],y[.],z{a}[.])"96 pmol1 = readers.KappaReader.parse_molecule(self.mol1, [pmdef0])97 assert pmol1.write_as_kappa() == "M(x[.],y[.],z{a}[_])"98 assert pmol1.sites[2].bond.wild99 pmdef1 = readers.KappaReader.parse_mtype(self.mdef1)100 pmol2 = readers.KappaReader.parse_molecule(self.mol2, [pmdef1])101 assert pmol2.write_as_kappa() == "M+_2-985798f(x[#],y[.],z{a}[#])"102 assert pmol2.sites[0].bond.any103 assert pmol2.name == "M+_2-985798f"104 def test_vars_parse(self):105 kr = readers.KappaReader()106 kr.lines = ["%agent: C(x, y{state state2})", "%agent: A()", "%agent: Ste5(ste5, ste4)", "%agent: Ste4(ste5)",107 "%var: 'a' 3", "%var: 'b' 3 + 'a'", "%var: 'c' |C(x[_],y{state})|", "%var: 'd' |A()| + 'b'",108 "%obs: 'membrane Ste5' |Ste5(ste4[1]),Ste4(ste5[1])|", "%var: 'combo' 'membrane Ste5' / 'a'", 
self.obs0]109 model = kr.parse()110 assert model.molecules[0].name == 'C'111 assert model.molecules[0].site_name_map == {'x': 'x', 'y': 'y'}112 assert len(model.functions) == 2113 assert model.functions[1].name == 'combo'114 assert model.parameters[0].name == 'a'115 assert model.parameters[1].name == 'b'116 assert set(model.parameters[1].value.atom_list) == {'3', '+', 'a'}117 assert isinstance(model.parameters[1].value, objects.Expression)118 assert model.observables[0].write_as_kappa() == "%obs: 'c' |C(x[_],y{state}[#])|"119 assert len(model.observables) == 4120 assert model.observables[1].name == "anon_obs0"121 assert model.observables[-2].name == "membrane Ste5"122 assert len(model.observables[-1].cpatterns) == 1123class TestParseBNGL:124 def __init__(self):125 pass126 @classmethod127 def setup_class(cls):128 cls.mdef0 = "Molecule(site0,site1~U~P~PP)"129 cls.mdef1 = "Mol(a,b~U~P,c~a~b,b~U~P,c~a~b)"130 cls.mdef2 = "Mol(sa,sb,sc,sd~U~P)"131 cls.mdef3 = "Mol(a,b~U~P,sb~U~P)"132 cls.mdef4 = "A(a~r~s)"133 cls.mdef5 = "B(b,c)"134 cls.mdef6 = "C(c)"135 cls.mdef7 = "K(s)"136 cls.mdef8 = "S(k, active~U~P)"137 cls.mds = [cls.mdef4, cls.mdef5, cls.mdef6, cls.mdef7, cls.mdef8]138 cls.mol0 = "Mol(sa,sb!+,sc!3,sd~U!?)"139 cls.mol1 = "Mol(a,b~U,b~P)"140 cls.mol1b = "Mol(a,b~U,b~P) thing"141 cls.mol2 = "Mol(a,b~?!+)"142 cls.init0 = cls.mol0 + ' 100'143 cls.init1 = cls.mol0 + '\t(x+3)/k'144 cls.obs0 = "Molecules Mol0 " + cls.mol0145 cls.obs1 = "Species Mol1 " + cls.mol0146 cls.obs2 = "Species Mol1T Mol1==3"147 cls.param0 = "kcat 1"148 cls.param1 = "kp=km/kd/(NA*V)"149 cls.rule0 = "A(a) + B(b)<->A(a!1).B(b!1) kp,km"150 # intermolecular rate151 cls.rule1 = "A(a~r)+B(b,c!1).C(c!1) -> A(a~r!2).B(b!2,c!1).C(c!1) kp / log10(10)"152 # intramolecular rule153 cls.rule2 = "A(a~s).B(b,c!1).C(c!1) -> A(a~s!2).B(b!2,c!1).C(c!1) kp/log10(10)"154 cls.rule3 = "K(s!1).S(k!1,active~U!?) -> K(s!1).S(k!1,active~P!?) 
kcat + 1"155 cls.rule4 = "A() <-> 0 rate, rate"156 cls.rule5 = "0 -> B(x) 4"157 cls.rule6 = "bdeg: B(x!+) -> 0 kdeg DeleteMolecules"158 cls.rule7 = "K(s!1).S(k!1,active~U!?) -> K(s!1) + S(k!1,active~U!?) k_dissoc()"159 cls.rule8 = "A() + A() -> 0 rate"160 cls.rule9 = "A().A().A() -> 0 rate"161 cls.rule10 = "0 -> A().B() rate"162 @classmethod163 def teardown_class(cls):164 pass165 def test_mdef_parse(self):166 assert readers.BNGLReader.parse_mtype(self.mdef0).write_as_bngl() == self.mdef0167 md1 = readers.BNGLReader.parse_mtype(self.mdef1)168 md1.site_name_map['b0'] = 'b'169 md1.site_name_map['b1'] = 'b'170 assert md1.write_as_bngl() == "Mol(a,b~U~P,c~a~b,b~U~P,c~a~b)"171 def test_mol_parse(self):172 pmdef2 = readers.BNGLReader.parse_mtype(self.mdef2)173 assert readers.BNGLReader.parse_molecule(self.mol0, [pmdef2]).write_as_bngl() == self.mol0174 pmdef3 = readers.BNGLReader.parse_mtype(self.mdef3)175 mol1 = readers.BNGLReader.parse_molecule(self.mol1, [pmdef3])176 assert mol1.write_as_bngl() == "Mol(a,b~U,b~P)"177 mol2 = readers.BNGLReader.parse_molecule(self.mol2, [pmdef3])178 assert mol2.write_as_bngl() == "Mol(a,b~?!+)"179 print mol2.write_as_kappa()180 assert mol2.write_as_kappa() == "Mol(a[.],b{#}[_])"181 @raises(rbexceptions.NotAMoleculeException)182 def test_bmol_parse(self):183 pmdef3 = readers.BNGLReader.parse_mtype(self.mdef3)184 readers.BNGLReader.parse_molecule(self.mol1b, [pmdef3])185 def test_init_parse(self):186 pmd2 = readers.BNGLReader.parse_mtype(self.mdef2)187 assert readers.BNGLReader.parse_init(self.init0, [pmd2]).write_as_bngl() == self.mol0 + ' 100.0'188 assert readers.BNGLReader.parse_init(self.init1, [pmd2]).write_as_bngl(189 {'x': 'x', 'k': 'k'}) == self.mol0 + ' (x+3)/k'190 def test_obs_parse(self):191 pmdef2 = readers.BNGLReader.parse_mtype(self.mdef2)192 pmdef3 = readers.BNGLReader.parse_mtype(self.mdef3)193 assert readers.BNGLReader.parse_obs(self.obs0, [pmdef2]).write_as_bngl({"Mol0": "Mol0"}) == self.obs0194 assert 
readers.BNGLReader.parse_obs(self.obs1, [pmdef3]).write_as_bngl({"Mol1": "Mol1"}) == self.obs1195 assert readers.BNGLReader.parse_obs(self.obs2, [pmdef3]) is None196 def test_params_parse(self):197 assert readers.BNGLReader.parse_param(self.param0).write_as_bngl({"kcat": "kcat"}) == self.param0198 namespace = {"kp": "kp", "km": "km", "kd": "kd", "NA": "NA", "V": "V"}199 assert readers.BNGLReader.parse_param(self.param1).write_as_bngl(namespace) == "kp km/kd/(NA*V)"200 def test_rule_parse(self):201 mds = [readers.BNGLReader.parse_mtype(x) for x in self.mds]202 prule0 = readers.BNGLReader.parse_rule(self.rule0, mds)203 assert prule0.rev is True204 assert prule0.write_as_bngl({"kp": "kp", "km": "km"}) == "A(a)+B(b) <-> A(a!1).B(b!1) kp,km"205 prule1 = readers.BNGLReader.parse_rule(self.rule1, mds)206 assert prule1.write_as_bngl({"kp": "kp"}) == "A(a~r)+B(b,c!1).C(c!1) -> A(a~r!2).B(b!2,c!1).C(c!1) kp/log10(10)"207 assert prule1.write_as_kappa() == "A(a{r}[.]),B(b[.],c[1]),C(c[1]) -> A(a{r}[2]),B(b[2],c[1]),C(c[1]) @ 'kp'/([log](10)/[log](10))"208 prule2 = readers.BNGLReader.parse_rule(self.rule2, mds)209 assert prule2.rate.intra_binding is True210 assert prule2.write_as_bngl({"kp": "kp"}) == self.rule2211 assert prule2.write_as_kappa() == "A(a{s}[.]),B(b[.],c[1]),C(c[1]) -> A(a{s}[2]),B(b[2],c[1]),C(c[1]) @ 0 {'kp'/([log](10)/[log](10))}"212 prule3 = readers.BNGLReader.parse_rule(self.rule3, mds)213 assert prule3.rate.intra_binding is False214 assert prule3.write_as_bngl({"kcat": "kcat"}) == "K(s!1).S(k!1,active~U!?) -> K(s!1).S(k!1,active~P!?) 
kcat+1"215 assert prule3.write_as_kappa() == "K(s[1]),S(k[1],active{U}[#]) -> K(s[1]),S(k[1],active{P}[#]) @ 'kcat' + 1"216 prule4 = readers.BNGLReader.parse_rule(self.rule4, mds)217 assert len(prule4.rhs) == 1218 assert prule4.rate.write_as_bngl({"rate": "rate2"}) == 'rate2'219 assert prule4.delmol220 prule5 = readers.BNGLReader.parse_rule(self.rule5, mds)221 assert isinstance(prule5.lhs[0].molecule_list[0], objects.PlaceHolderMolecule)222 assert len(prule5.rhs) == 1223 prule6 = readers.BNGLReader.parse_rule(self.rule6, mds)224 assert prule6.label == 'bdeg'225 assert prule6.rate.rate == 'kdeg'226 assert prule6.delmol227 prule7 = readers.BNGLReader.parse_rule(self.rule7, mds)228 assert len(prule7.rhs) == 2229 prule8 = readers.BNGLReader.parse_rule(self.rule8, mds)230 assert len(prule8.rhs[0].molecule_list) == 1231 assert isinstance(prule8.rhs[0].molecule_list[0], objects.PlaceHolderMolecule)232 prule9 = readers.BNGLReader.parse_rule(self.rule9, mds)233 assert len(prule9.rhs) == 3234 assert isinstance(prule9.rhs[0].molecule_list[0], objects.PlaceHolderMolecule)235 prule10 = readers.BNGLReader.parse_rule(self.rule10, mds)236 assert len(prule10.rhs) == 1237 assert len(prule10.rhs[0].molecule_list) == 2238 assert isinstance(prule10.lhs[0].molecule_list[0], objects.PlaceHolderMolecule)239 assert isinstance(prule10.lhs[1].molecule_list[0], objects.PlaceHolderMolecule)240 @raises(rbexceptions.NotCompatibleException)241 def test_invalid_rule_rate(self):242 lines = ['begin molecule types', 'A(x)', 'end molecule types',243 'begin observables', 'Molecules A A().A()', 'end observables',244 'begin functions', 'f() A / 2', 'end functions',245 'begin reaction rules', 'A(x)+A(x) -> A(x!1).A(x!1) f()', 'end reaction rules']246 br = readers.BNGLReader()247 br.lines = lines248 m = br.parse()249 m.write_as_kappa('test.ka', True)
mds_parser-v04.py
Source:mds_parser-v04.py
"""Apply per-project parsing rules to stored MDS text rows and save the results."""
import datetime
import os

from modules import dbmodule
from modules import parsemodule

# Label written into column 5 of every result row.
_TE_NAME = "Vessel Design"

# (client, project, feeder) -> name of the parsemodule cropping function to use.
# The function is looked up by name at call time so that only the matching
# attribute of parsemodule is ever touched.
_CROPPERS = {
    ("ADNOC", "Hail Gasha", "BECHTEL"): "cropping_adnoc_bechtel",
    ("ARAMCO", "UNAYZAH", "WorleyParsons"): "cropping_aramco_worleyparsons",
    ("BOROUGE", "Borouge", "Tecnimont"): "cropping_borouge_tecnimont",
    ("PETRONAS", "RAPID", "Tecnhip"): "cropping_petronas_tecnhip",
    ("SAVIC", "JUPC EOEG", "WorleyParsons"): "cropping_savic_worleyparsons",
}


def get_path_info(_file_path: str):
    """Split a path into (directory, basename, name without extension, extension)."""
    file_path, file_basename = os.path.split(_file_path)
    file_name, file_ext = os.path.splitext(file_basename)
    return file_path, file_basename, file_name, file_ext


def _rule_columns(parse_rule):
    """Return the three rule-derived columns shared by every result row."""
    return [parse_rule.get("seq"), parse_rule.get("category"), parse_rule.get("key_name")]


def _make_record(client, project, discipline, feeder, parse_rule, value, row):
    """Build one 15-column result row for *value* extracted from the DB row *row*."""
    return [
        client, project, discipline, discipline, feeder, _TE_NAME,
        *_rule_columns(parse_rule),
        value,
        row.get("page_total"), row.get("page_no"), row.get("file_name_origin"),
        row.get("num"), row.get("content"),
    ]


def _select_following_rows(client, project, discipline, file_name, parse_rule, row):
    """Query the rows that follow *row*, filtered by the rule's ``find_next_word``.

    The sentinel value ``"ALL LINE READ"`` is passed to the query as an empty string.
    """
    next_word = parse_rule.get("find_next_word")
    if next_word == "ALL LINE READ":
        next_word = ""
    return dbmodule.select_next_data(client, project, discipline, file_name, next_word,
                                     row.get("page_no"), row.get("num"), parse_rule.get("next_line"))


def text_not_found(client, project, discipline, feeder, file_name, parse_rule):
    """Return a single placeholder row (value ``"N/A"``) for a rule that matched nothing."""
    rtn_val = [client, project, discipline, discipline, feeder, _TE_NAME]
    rtn_val.extend(_rule_columns(parse_rule))
    rtn_val.extend(["N/A", None, None, file_name, None, None])
    return [rtn_val]


def find_selected_by_radio(data_set):
    """Pick the selected option from the last two elements of *data_set*.

    Exactly one of the last two elements must start with ``"o "``; the other
    one is returned. When both or neither start with ``"o "`` the result is
    ``"N/A"``. If *data_set* has three or more elements and a selection was
    found, the first element is prepended to it.

    *data_set* must contain at least two strings.
    """
    rtn_val = "N/A"
    second_last_marked = data_set[-2].startswith("o ")
    last_marked = data_set[-1].startswith("o ")
    if second_last_marked and not last_marked:
        print("case1")
        rtn_val = data_set[-1]
    elif not second_last_marked and last_marked:
        rtn_val = data_set[-2]
        print("case2")
    else:
        print("case3")
    if len(data_set) < 3 or rtn_val == "N/A":
        return rtn_val
    return data_set[0] + rtn_val


def parse_line_text(client, project, discipline, feeder, file_name, parse_rule):
    """Extract a value from every row that contains the rule's ``find_word``.

    Returns one result row per row with a non-empty extraction, or the
    ``text_not_found`` placeholder when nothing was extracted.
    """
    rtn_val = []
    text_data = dbmodule.select_mds_data(client, project, discipline, file_name, parse_rule.get("find_word"))
    for row in text_data:
        print(row.get("content"))
        ext_text = parsemodule.extract_text(row.get("content"), parse_rule.get("extract_rule"))
        print(ext_text)
        if not ext_text:
            continue
        first = ext_text[0]
        # A tuple result is resolved to the selected option; anything else is used as-is.
        value = find_selected_by_radio(first) if isinstance(first, tuple) else first
        rtn_val.append(_make_record(row.get("client"), row.get("project"), row.get("discipline"),
                                    feeder, parse_rule, value, row))
    return rtn_val or text_not_found(client, project, discipline, feeder, file_name, parse_rule)


def parse_multi_text(client, project, discipline, feeder, file_name, parse_rule):
    """Extract values from the rows that follow each ``find_word`` row.

    Scanning stops after the first ``find_word`` row whose following rows
    produced at least one result. Falls back to the ``text_not_found``
    placeholder when nothing was extracted.
    """
    rtn_val = []
    text_data = dbmodule.select_mds_data(client, project, discipline, file_name, parse_rule.get("find_word"))
    for row in text_data:
        next_data = _select_following_rows(client, project, discipline, file_name, parse_rule, row)
        for next_row in next_data:
            print(parse_rule.get("extract_rule"), next_row.get("content"))
            ext_text = parsemodule.extract_text(next_row.get("content"), parse_rule.get("extract_rule"))
            print(ext_text)
            if ext_text:
                rtn_val.append(_make_record(next_row.get("client"), next_row.get("project"),
                                            next_row.get("discipline"), feeder, parse_rule,
                                            ext_text[0], next_row))
        # NOTE(review): the nesting level of this break was reconstructed from a
        # flattened listing; it is placed at the outer-loop level - confirm.
        if rtn_val:
            break
    return rtn_val or text_not_found(client, project, discipline, feeder, file_name, parse_rule)


def _sub_find_word(project, feeder, parse_rule):
    """Choose the marker text that identifies the sub-section to read."""
    if project == "Borouge" and feeder == "Tecnimont":
        if "ext" in parse_rule.get("key_name"):
            return "CONDITION 2"
    elif project == "RAPID" and feeder == "Tecnhip":
        # NOTE(review): the else branch is paired with this inner test
        # (Pressure vs. Temperature); nesting was reconstructed - confirm.
        if parse_rule.get("category") in ("v-4", "v-9"):
            return "Pressure"
        return "Temperature"
    return "CONDITION 1"


def parse_sub_text(client, project, discipline, feeder, file_name, parse_rule):
    """Extract values from the rows surrounding a sub-section marker.

    For each row following a ``find_word`` row that contains the marker text,
    the previous row, the marker row and the next row are run through the
    rule's extraction. Falls back to the ``text_not_found`` placeholder when
    nothing was extracted.
    """
    rtn_val = []
    sub_find_word = _sub_find_word(project, feeder, parse_rule)
    text_data = dbmodule.select_mds_data(client, project, discipline, file_name, parse_rule.get("find_word"))
    for row in text_data:
        next_data = _select_following_rows(client, project, discipline, file_name, parse_rule, row)
        for n, next_row in enumerate(next_data):
            if sub_find_word not in next_row.get("content"):
                continue
            # Clamp the window start: with n == 0 a start of n - 1 == -1 would
            # count from the END of the list and yield an empty/wrong slice.
            for sub_row in next_data[max(n - 1, 0): n + 2]:
                ext_text = parsemodule.extract_text(sub_row.get("content"), parse_rule.get("extract_rule"))
                if ext_text:
                    rtn_val.append(_make_record(client, project, discipline, feeder, parse_rule,
                                                ext_text[0], sub_row))
    return rtn_val or text_not_found(client, project, discipline, feeder, file_name, parse_rule)


def parse_cropped_image(client, project, discipline, feeder, file_name, parse_rule):
    """Build result rows from the project-specific cropping parser.

    Each parsed item gets a running index across all ``find_word`` rows: the
    rule's ``seq`` is offset by the zero-based index and the one-based index is
    appended to the rule's ``category``. Returns an empty list when nothing
    was parsed (no placeholder row is produced here).
    """
    rtn_val = []
    parsed_set = []
    cropper_name = _CROPPERS.get((client, project, feeder))
    cropper = getattr(parsemodule, cropper_name) if cropper_name else None
    text_data = dbmodule.select_mds_data(client, project, discipline, file_name, parse_rule.get("find_word"))
    idx = 0
    for row in text_data:
        next_data = dbmodule.select_next_data(client, project, discipline, file_name, "",
                                              row.get("page_no"), row.get("num"), parse_rule.get("next_line"))
        if cropper is not None:
            parsed_set = cropper(next_data)
        for parse_text in parsed_set:
            idx += 1
            obj = [row.get("client"), row.get("project"), row.get("discipline"), row.get("discipline"),
                   feeder, _TE_NAME]
            obj.extend([parse_rule.get("seq") + (idx - 1), parse_rule.get("category") + str(idx)])
            obj.append(parse_rule.get("key_name"))
            obj.append("^".join(parse_text))
            obj.extend([row.get("page_total"), row.get("page_no"), row.get("file_name_origin")])
            obj.extend([row.get("num"), " ".join(parse_text)])
            rtn_val.append(obj)
    return rtn_val


def runner(pdf_file, feeder):
    """Run every parsing rule of the project against *pdf_file* and store the rows.

    The last three components of the file's directory path are taken as
    client, project and discipline. Previously stored results for the file are
    deleted before the new rows (sorted by column 6, the rule ``seq``) are saved.

    Raises:
        ValueError: if a rule has an unknown ``extract_method``, or the
            directory path has fewer than three ``/``-separated components.
    """
    # TE_name: "Concrete Strength Calc" -> "Vessel Design"
    start_time = datetime.datetime.now()
    print("MDS parsing ...")
    file_path, _, file_name, _ = get_path_info(pdf_file)
    client, project, discipline = file_path.split("/")[-3:]
    result_set = []

    # Fetch the parsing rules.
    start_tm = datetime.datetime.now()
    rule_set = dbmodule.select_rules_by_project(project, feeder)
    end_tm = datetime.datetime.now()
    print(f"parsing rule read({end_tm - start_tm})")
    start_tm = end_tm

    for rule in rule_set:
        method = rule.get("extract_method")
        if method == "cropped":
            parser = parse_cropped_image
        elif method == "regex":
            # A rule with find_next_word reads the rows after the matched one.
            parser = parse_multi_text if rule.get("find_next_word") else parse_line_text
        elif method == "regex_sub_text":
            parser = parse_sub_text
        else:
            raise ValueError(f"Unknown parsing method: {method}")
        result_set.extend(parser(client, project, discipline, feeder, file_name, rule))

        end_tm = datetime.datetime.now()
        print(f"{rule.get('category')} parsed({end_tm - start_tm})")
        start_tm = end_tm

    del_count = dbmodule.delete_result(client, project, discipline, file_name)
    # "Delete complete: N rows deleted" (restored from a mis-decoded literal).
    print(f"삭제 완료: {del_count}건 삭제")
    result_set.sort(key=lambda x: x[6])
    for r in result_set:
        print(r)
    insert_count = dbmodule.save_result(result_set)
    # "Job complete: N rows saved".
    # NOTE(review): the first word was reconstructed from a mis-decoded
    # literal - confirm against the original file.
    print(f"작업 완료: {insert_count}건 저장")
    print(f"MDS parsed ({datetime.datetime.now() - start_time})!")


if __name__ == "__main__":
    # target_file = "samsung-itb/ADNOC/Hail Gasha/MDS/ADNOC-Hail Ghasha-VESSEL MDS-Mark-Up.pdf"
    # target_feeder = "BECHTEL"
    # target_file = "samsung-itb/ARAMCO/UNAYZAH/MDS/ARAMCO-UNAYZAH-VESSEL MDS-Mark-Up.pdf"
    # target_feeder = "WorleyParsons"
    # target_file = "samsung-itb/BOROUGE/Borouge/MDS/BOROUGE-Borouge #4-VESSEL MDS-Mark-Up.pdf"
    # target_feeder = "Tecnimont"
    # target_file = "samsung-itb/PETRONAS/RAPID/MDS/PETRONAS-RAPID-VESSEL PDS-Mark-Up.pdf"
    # target_feeder = "Tecnhip"
    target_file = "samsung-itb/SAVIC/JUPC EOEG/MDS/SAVIC-JUPC EOEG-VESSEL MDS-Mark-Up.pdf"
    target_feeder = "WorleyParsons"
    ...  # the remainder of the script is elided in this excerpt
d19.py
Source:d19.py
...14 messages.append(line.strip())15 return rules, messages16def first_answer(data):17 rules, messages = parse_rules(data)18 def parse_rule(index):19 s = rules[index]20 if LITERAL.match(s):21 return LITERAL.match(s)[1]22 new_rule = '(' + '|'.join(23 ''.join(parse_rule(rule_number.strip()) for rule_number in choice.split()) for choice in s.split('|')) + ')'24 return new_rule25 rule_zero = re.compile(f'^{parse_rule("0")}$')26 return sum(rule_zero.fullmatch(message) is not None for message in messages)27@timer28def second_answer(data):29 rules, messages = parse_rules(data)30 capture_group_name_index = 031 @lru_cache()32 def parse_rule(index):33 if index == '8':34 return f'{parse_rule("42")}+'35 if index == '11':36 nonlocal capture_group_name_index37 capture_group_name_index += 138 rule_42 = parse_rule("42")39 rule_31 = parse_rule("31")40 return '(' + '|'.join(f'({rule_42}{{{n}}}{rule_31}{{{n}}})' for n in range(1, 46)) + ')'41 s = rules[index]42 if LITERAL.match(s):43 return LITERAL.match(s)[1]44 new_rule = '(' + '|'.join(45 ''.join(parse_rule(rule_number.strip()) for rule_number in choice.split()) for choice in s.split('|')) + ')'46 return new_rule47 rule_zero = re.compile(f'^{parse_rule("0")}$')48 return sum(rule_zero.fullmatch(message) is not None for message in messages)49TEST_DATA1 = """0: 4 1 5501: 2 3 | 3 2512: 4 4 | 5 5523: 4 5 | 5 4534: "a"545: "b"55ababbb56bababa57abbbab58aaabbb59aaaabbb"""60TEST_DATA2 = """42: 9 14 | 10 1619: 14 27 | 1 26...
Learn to execute automation testing from scratch with LambdaTest Learning Hub, from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile step-by-step guides to help you become proficient with different test automation frameworks, e.g., Selenium, Cypress, and TestNG.
You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.
Get 100 automation test minutes FREE!!