Best Python code snippet using gherkin-python
user_alerts_mon_2013_ref.py
Source: user_alerts_mon_2013_ref.py
1"""2Created on Feb 21, 20133@author: wmechem4This is the finRocket ALERT module. It is code to monitor5incoming stories and process alerts based on settings saved by users.6Alerts may include ENTITIES, TICKERS, contained in the story title or7body as well as SENTIMENT etc. See parse_rules()8Uses ZMQ to listen to incoming messages.9Alerts are sent via EMAIL or SMS based on user's preferences10pylint 7.5 / 10 2016/05/15 - needs refactoring11"""12from datetime import datetime13from email.mime.text import MIMEText14import simplejson as json15import logging16from multiprocessing import Process, Queue17import os18import re19import smtplib20import threading21import time22from flask import Flask23from flask.ext.sqlalchemy import SQLAlchemy24import MySQLDB as mdb25from pandas import DataFrame26from pandas.io import sql27import zmq28#setup ZMQ29UA_CONTEXT = zmq.Context()30USER_ALERTS_PULL_ADDR = 'tcp://127.0.0.1:6040'31print "user_alerts module 2013_09_09_localIP"32HOME_DIR = os.environ['HOME']33print 'Home DIR is ', HOME_DIR34LOAD_ALERT_FREQ = 300 # seconds before checking for changes35DB_HOST = 'xxx5-22.compute-1.amazonaws.com'36ALERT_DB_LIMIT = 1000 # number of alerts to load37ALERT_DB_OFFSET = 038logging.basicConfig(filename=HOME_DIR + "/kj_alerts.log", level=logging.DEBUG)39logging.info("Starting User Alerts Monitor @ " + str(datetime.now()))40MODULE_NAME = "User Alerts Monitor"41VERBOSE = 0 # set VERBOSE != 1 to turn off extra logging42APP = Flask(__name__)43DB = SQLAlchemy(APP)44APP.config['SQLALCHEMY_DATABASE_URI'] = 'mysql://kj_user:pwd@' + (45 DB_HOST + '/kj_feb_2013_01')46USER_ALERTS = ""47class UserProfile(DB.Model):48 """ Access user profiles. """49 def __init__(self):50 pass51 __tablename__ = 'kj_users'52 id_ = DB.Column(DB.Integer, primary_key=True)53 username = DB.Column(DB.String(80))54 userpasswd = DB.Column(DB.String(80))55 user_created_date = DB.Column(DB.Date)56 user_email = DB.Column(DB.String(100))57 user_mobile = DB.Column(DB.String(40))58 user_carrier = DB.Column(DB.String(40))59class UserAlert(DB.Model):60 """ Access user alerts. """61 __tablename__ = 'kj_user_alerts'62 id_ = DB.Column(DB.Integer, primary_key=True)63 username = DB.Column(DB.String(80))64 user_alert_name = DB.Column(DB.String(20))65 user_alert_created_date = DB.Column(DB.Date)66 user_alert_scope = DB.Column(DB.String(80))67 user_alert_condition = DB.Column(DB.String(40))68 user_alert_keywords = DB.Column(DB.String(2000))69 user_alert_actions = DB.Column(DB.String(40))70 user_alert_triggered_state = DB.Column(DB.String(40))71 user_alert_triggered_time = DB.Column(DB.Date)72 user_alert_delet = DB.Column(DB.Integer(1))73 user_alert_ext_op = DB.Column(DB.String(10))74 user_alert_ext_scop = DB.Column(DB.String(80))75 user_alert_ext_conditio = DB.Column(DB.String(40))76 user_alert_ext_keywords = DB.Column(DB.String(2000))77 def __init__(self, id_, username, a_name, a_created_date, a_scope,78 a_condition, a_keywords, a_actions, a_state, a_state_time,79 a_delete, a_ext_op, a_ext_scope, a_ext_condition, a_ext_keywords):80 """ Initialize alert attributes. 
"""81 self.id_ = id_82 self.username = username83 self.user_alert_name = a_name84 self.user_alert_created_date = a_created_date85 self.user_alert_scope = a_scope86 self.user_alert_condition = a_condition87 self.user_alert_keywords = a_keywords88 self.user_alert_actions = a_actions89 self.user_alert_triggered_state = a_state90 self.user_alert_triggered_time = a_state_time91 self.user_alert_delete = a_delete92 self.user_alert_ext_op = a_ext_op93 self.user_alert_ext_scope = a_ext_scope94 self.user_alert_ext_condition = a_ext_condition95 self.user_alert_ext_keywords = a_ext_keywords96def do_log(msg):97 """ Generic logging function. """98 print msg99 if VERBOSE == 1:100 l_string = (MODULE_NAME + " "101 + msg + " " + str(datetime.now()))102 logging.info(l_string)103 else:104 return105 return106def load_user_alerts_df():107 """ Load user alerts. """108 con = mdb.connect(DB_HOST, 'kj_user', 'xxx', 'kj_feb_2013_01')109 dataframe = sql.read_frame("SELECT * FROM kj_user_alerts;", con)110 con.close()111 return dataframe112def load_user_alerts_in_q(user_alerts_rules_q):113 """ Periodically load user alerts in to a queue114 so that we can detect changes.115 """116 while True:117 try:118 # Keep getting queue until empty then replace with rules from db119 rules = user_alerts_rules_q.get_nowait()120 user_alerts_rules_q.put(rules)121 except Exception:122 # if queue is empty load alerts from database123 df = load_user_alerts_df()124 user_alerts_rules_q.put(df)125 time.sleep(LOAD_ALERT_FREQ)126 return127def load_once_user_alerts(limit, offset):128 """ init user alerts once from DB then from q """129 results = UserAlert.query.limit(limit).offset(offset).all()130 json_results = []131 for result in results:132 data = {"id_": result.id_,133 'username': result.username,134 'user_alert_name': result.user_alert_name,135 'user_alert_scope': result.user_alert_scope,136 'user_alert_condition': result.user_alert_condition,137 'user_alert_keywords': result.user_alert_keywords,138 'user_alert_actions': result.user_alert_actions,139 'user_alert_triggered_state': result.user_alert_triggered_state,140 'user_alert_triggered_time': result.user_alert_triggered_time,141 'user_alert_delete': result.user_alert_delete,142 'user_alert_ext_op': result.user_alert_ext_op,143 'user_alert_ext_scope': result.user_alert_ext_scope,144 'user_alert_ext_condition': result.user_alert_ext_condition,145 'user_alert_ext_keywords': result.user_alert_ext_keywords,146 }147 json_results.append(data)148 return json_results149def load_user_profile(username):150 """ get user info from DB """151 result = UserProfile.query.filter_by(username=username).first()152 json_result = {'username': result.username,153 'userpasswd': result.userpasswd,154 'user_created_date': result.user_created_date,155 'user_email': result.user_email,156 'user_mobile': result.user_mobile,157 'user_carrier': result.user_carrier158 }159 return json_result160def get_messages_from_kj_main_t(context, user_alerts_pull_addr,161 in_messages_pool_q):162 """ Listen for messages to parse coming from main process163 and put them in a queue.164 """165 context = context166 alerts_pull_socket = context.socket(zmq.PULL)167 alerts_pull_socket.connect(USER_ALERTS_PULL_ADDR)168 while True:169 if VERBOSE == 1:170 l_string = MODULE_NAME + " waiting for new message " + str(171 datetime.now())172 logging.info(l_string)173 message = alerts_pull_socket.recv_pyobj()174 if VERBOSE == 1:175 l_string = MODULE_NAME + ("putting message in in_messages_q " +176 str(datetime.now()))177 logging.info(l_string)178 
        in_messages_pool_q.put(message)
        time.sleep(.1)
    return

def send_alert(target, user_alert_name, alert_out_message):
    """ Main SMTP handler. """
    comma_space = ', '
    dist_list = [target]
    smtpuser = 'msgs@wjtglobal.com'  # for SMTP AUTH, set SMTP username here
    smtppass = 'xxx'  # for SMTP AUTH, set SMTP password here
    msg = MIMEText(alert_out_message)
    msg['Subject'] = user_alert_name
    msg['From'] = 'alerts@finrocket.com'
    msg['To'] = comma_space.join(dist_list)
    mail_server = smtplib.SMTP('smtp.1and1.com', 587)
    mail_server.ehlo()
    mail_server.starttls()
    mail_server.ehlo()
    mail_server.login(smtpuser, smtppass)
    mail_server.sendmail(smtpuser, dist_list, msg.as_string())
    mail_server.close()
    return

def parse_rules(user_alerts_rules_q, new_rules_q):
    """ Get rules from q and parse them. Create dictionary objects to
    quickly test for existence of a rule when messages come in.
    """
    sent_dict = {}
    sent_keys = [-2, -1.75, -1.5, -1.25, 0, 1.25, 1.5, 1.75, 2]
    for key in sent_keys:
        sent_dict[key] = []
    title_dict = {}
    title_not_dict = {}
    any_dict = {}
    any_not_dict = {}
    entities_dict = {}
    entities_not_dict = {}
    tickers_dict = {}
    tickers_not_dict = {}
    rules = user_alerts_rules_q.get()
    do_log("Got rules DF from user_alerts_rules_q " + str(rules))
    if VERBOSE == 1:
        l_string = MODULE_NAME + " loaded user alert rules " + (
            str(datetime.now()))
        logging.info(l_string)
    else:
        pass
    print rules
    for i, row in enumerate(rules.values):
        do_log("Row is " + str(i))
        do_log("Number of rules is " + str(len(rules.values)))
        id_, username, user_alert_name, user_alert_created_date, \
            user_alert_scope, user_alert_condition, user_alert_keywords, \
            user_alert_actions, user_alert_triggered_state, \
            user_alert_triggered_time, user_alert_delete, \
            user_alert_ext_op, user_alert_ext_scope, user_alert_ext_condition,\
            user_alert_ext_keywords = row
        print str(id_) + " " + user_alert_name + (
            " " + user_alert_scope + " " +
            user_alert_condition + " " + user_alert_keywords)
        user_alert_scope = user_alert_scope.upper()
        user_alert_condition = user_alert_condition.upper()
        rule_keywords = user_alert_keywords.split(' ')
        rule_keywords = set(rule_keywords)
        if user_alert_scope == 'TITLE' and (
                user_alert_condition == 'CONTAINS'):
            for key in rule_keywords:
                key = key.upper()
                if title_dict.get(key):
                    title_dict[key].append(id_)
                else:
                    title_dict[key] = [id_]
        if user_alert_scope == 'TITLE' and (
                user_alert_condition == 'DOES NOT CONTAIN'):
            for key in rule_keywords:
                key = key.upper()
                if title_not_dict.get(key):
                    title_not_dict[key].append(id_)
                else:
                    title_not_dict[key] = [id_]
        if user_alert_scope == 'ANY' and (
                user_alert_condition == 'CONTAINS'):
            for key in rule_keywords:
                key = key.upper()
                if any_dict.get(key):
                    any_dict[key].append(id_)
                else:
                    any_dict[key] = [id_]
        if user_alert_scope == 'ANY' and (
                user_alert_condition == 'DOES NOT CONTAIN'):
            for key in rule_keywords:
                key = key.upper()
                if any_not_dict.get(key):
                    any_not_dict[key].append(id_)
                else:
                    any_not_dict[key] = [id_]
        if user_alert_scope == 'TICKER' and (
                user_alert_condition == 'CONTAINS'):
            for key in rule_keywords:
                key = key.upper()
                if tickers_dict.get(key):
                    tickers_dict[key].append(id_)
                else:
                    tickers_dict[key] = [id_]
        if user_alert_scope == 'TICKER' and (
                user_alert_condition == 'DOES NOT CONTAIN'):
            for key in rule_keywords:
                key = key.upper()
                if tickers_not_dict.get(key):
                    tickers_not_dict[key].append(id_)
                else:
                    tickers_not_dict[key] = [id_]
        if user_alert_scope == 'ENTITIES' and (
                user_alert_condition == 'CONTAINS'):
            for key in rule_keywords:
                key = key.upper()
                if entities_dict.get(key):
                    entities_dict[key].append(id_)
                else:
                    entities_dict[key] = [id_]
        if user_alert_scope == 'ENTITIES' and (
                user_alert_condition == 'DOES NOT CONTAIN'):
            for key in rule_keywords:
                key = key.upper()
                if entities_not_dict.get(key):
                    entities_not_dict[key].append(id_)
                else:
                    entities_not_dict[key] = [id_]
        if user_alert_scope == 'SENTIMENT':
            do_log("Processing SENTIMENT dictionary")
            do_log("Sentiment Alert processed for id_ " +
                   str(id_) + " " + user_alert_scope + " " +
                   user_alert_condition + " " + str(user_alert_keywords))
            print user_alert_scope + " " + user_alert_condition
        if user_alert_scope == 'SENTIMENT' and user_alert_condition == '=':
            print user_alert_scope + " " + user_alert_condition
            user_alert_keywords = float(user_alert_keywords)
            for key in sent_dict.keys():
                if float(user_alert_keywords) == float(key):
                    print "Matched =", float(key), float(user_alert_keywords)
                    if sent_dict.get(key):
                        sent_dict[key].append(str(id_))
                        print sent_dict[key]
                        print str(id_) + (
                            " appending sent_dict for key " + str(key))
                    else:
                        sent_dict[key] = [str(id_)]
                        print sent_dict[key]
                        print str(id_) + (
                            " creating entry in sent_dict for key " + str(key))
                else:
                    pass
        if user_alert_scope == 'SENTIMENT' and user_alert_condition == '>':
            print user_alert_scope + " " + user_alert_condition
            print "Keyword " + user_alert_keywords
            user_alert_keywords = float(user_alert_keywords)
            for key in sent_dict.keys():
                print "Sentiment Key: " + str(key)
                if float(user_alert_keywords) < float(key):
                    print "Matched >", float(key), float(user_alert_keywords)
                    if sent_dict.get(key):
                        sent_dict[key].append(str(id_))
                        print sent_dict[key]
                        print str(id_) + (
                            " appending sent_dict for key " + str(key))
                    else:
                        sent_dict[key] = [str(id_)]
                        print sent_dict[key]
                        print str(id_) + (
                            " creating entry in sent_dict for key " + str(key))
                else:
                    pass
        if user_alert_scope == 'SENTIMENT' and user_alert_condition == '<':
            print user_alert_scope + " " + user_alert_condition
            user_alert_keywords = float(user_alert_keywords)
            for key in sent_dict.keys():
                if float(user_alert_keywords) > float(key):
                    print "Matched < ", float(key), float(user_alert_keywords)
                    if sent_dict.get(key):
                        sent_dict[key].append(str(id_))
                        print sent_dict[key]
                        print str(id_) + " appending sent_dict for key " + (
                            str(key))
                    else:
                        sent_dict[key] = [str(id_)]
                        print sent_dict[key]
                        print str(id_) + (
                            " creating entry in sent_dict for key " + str(key))
                else:
                    pass
        # condition was '=' in the original; the >= comparison below
        # indicates this branch was meant to handle '>='
        if user_alert_scope == 'SENTIMENT' and user_alert_condition == '>=':
            print user_alert_scope + " " + user_alert_condition
            user_alert_keywords = float(user_alert_keywords)
            for key in sent_dict.keys():
                if float(user_alert_keywords) >= float(key):
                    print "Matched >= ", float(key), float(user_alert_keywords)
                    if sent_dict.get(key):
                        sent_dict[key].append(str(id_))
                        print sent_dict[key]
                        print str(id_) + " appending sent_dict for key " + (
                            str(key))
                    else:
                        sent_dict[key] = [str(id_)]
                        print sent_dict[key]
                        print str(id_) + (
                            " creating entry in sent_dict for key " + str(key))
                else:
                    pass
        # condition was '=' in the original; the <= comparison below
        # indicates this branch was meant to handle '<='
        if user_alert_scope == 'SENTIMENT' and user_alert_condition == '<=':
            print user_alert_scope + " " + user_alert_condition
            for key in sent_dict.keys():
                if float(user_alert_keywords) <= float(key):
                    print "Matched <=", float(key), float(user_alert_keywords)
                    if sent_dict.get(key):
                        sent_dict[key].append(str(id_))
                        print sent_dict[key]
                        print str(id_) + " appending sent_dict for key " + (
                            str(key))
                    else:
                        sent_dict[key] = [str(id_)]
                        print sent_dict[key]
                        print str(id_) + (
                            " creating entry in sent_dict for key " + str(key))
                else:
                    pass
    print "Parsed all rules into dictionaries"
    print "Title Keys:" + str(title_dict.keys())
    print "Title Not Keys:" + str(title_not_dict.keys())
    print "Any Keys:" + str(any_dict.keys())
    print "Any Not Keys:" + str(any_not_dict.keys())
    print "Entities Keys:" + str(entities_dict.keys())
    print "Entities Not Keys:" + str(entities_not_dict.keys())
    print "Tickers Keys:" + str(tickers_dict.keys())
    print "Tickers Not Keys:" + str(tickers_not_dict.keys())
    if VERBOSE == 1:
        l_string = MODULE_NAME + " Parsed all rules into dictionaries " + (
            str(datetime.now()))
        logging.info(l_string)
    else:
        pass
    out_list = [rules, sent_dict, title_dict, any_dict,
                entities_dict, tickers_dict, any_not_dict,
                title_not_dict, entities_not_dict, tickers_not_dict]
    new_rules_q.put(out_list)
    return out_list

def get_new_rules(new_rules_q):
    """ Check queue for new rules. """
    print "Getting new rules"
    if VERBOSE == 1:
        l_string = MODULE_NAME + " Getting new rules " + (
            str(datetime.now()))
        logging.info(l_string)
    else:
        pass
    try:
        # get rules from queue if they exist
        new_rules = (rules, sent_dict, title_dict,
                     any_dict, entities_dict, tickers_dict,
                     any_not_dict, title_not_dict, entities_not_dict,
                     tickers_not_dict) = new_rules_q.get_nowait()
    except Empty:
        print "No new rules to get"
    return new_rules

def get_new_message(in_messages_pool_q):
    """ Loop on message queue.get. """
    while True:
        try:
            # see if we have a new message
            message = in_messages_pool_q.get()
            print "Message ", len(message)
            if VERBOSE == 1:
                l_string = MODULE_NAME + " Message length " + (
                    str(len(message)) + " " + str(datetime.now()))
                logging.info(l_string)
            else:
                pass
            yield message
        except Empty:
            time.sleep(.1)

def parse_message(message):
    """ Parse fields from dict object received from main KJ process. """
    message = json.loads(message)
    m_title = message['m_title']
    m_description = message['m_description']
    m_sentiment = message['m_sentiment']
    m_tickers = message['m_tickers']
    m_entities = message['m_entities']
    m_link = message['m_link']
    alert_out_message = m_title[0:20] + " S" + m_sentiment + " " + m_link
    return (m_title, m_description, m_sentiment, m_tickers,
            m_entities, m_link), alert_out_message

def process_sent_dict(alert_dict, sent_dict, m_sentiment, alerts_fired):
    """ Check to see if sentiment alert is triggered. """
"""474 alert_dict = sent_dict475 log_msg = "Processing " + str(alert_dict)476 do_log(log_msg)477 if m_sentiment:478 key = float(m_sentiment)479 alerts_fired.append(alert_dict[key])480 log_msg = "Alerts fired contains a sentiment alert " + (481 str(alerts_fired))482 do_log(log_msg)483 return alerts_fired484def get_any_tokens(m_title, m_description):485 """ Make tokens out of title and description text. """486 message = nltk.clean_html(m_description)487 log_msg = "Message len after m_description html clean:" + (488 str(len(message)))489 do_log(log_msg)490 if nltk.clean_html(m_title):491 m_title = nltk.clean_html(m_title)492 message = message + " " + m_title493 log_msg = "Message has length after title html clean:" + (494 str(len(message)))495 do_log(log_msg)496 else:497 log_msg = (498 "Error processing m_title with nltl.clean_html ")499 do_log(log_msg)500 message = nltk.word_tokenize(message)501 punctuation = re.compile(r'[-.?!,&":;()|0-9]')502 tokens = [punctuation.sub(" ", token) for token in message]503 log_msg = str(tokens)504 do_log(log_msg)505 return tokens506def process_any_dict(alert_dict, any_dict, m_title, m_description,507 alerts_fired):508 """ If title or description contain matching text alert is509 triggered.510 """511 log_msg = "Processing matches for ANY CONTAINS"512 do_log(log_msg)513 print log_msg514 alert_dict = any_dict515 log_msg = " processing " + str(alert_dict)516 do_log(log_msg)517 tokens = get_any_tokens(m_title, m_description)518 for token in set(tokens):519 token = token.upper()520 log_msg = "Looking for: " + token521 do_log(log_msg)522 if alert_dict.get(token):523 alerts_fired.append(alert_dict[token])524 log_msg = "Added " + str(alert_dict[token]) + (525 " to alerts_fired")526 do_log(log_msg)527 else:528 log_msg = "Token not found in alert_dict: " + token529 do_log(log_msg)530 return alerts_fired531def process_any_not_dict(alert_dict, any_not_dict, m_title, m_description,532 alerts_fired):533 """ If title or description contain matching (NOT) text alert534 is triggered.535 """536 log_msg = "Processing matches for ANY DOES NOT CONTAIN"537 do_log(log_msg)538 alert_dict = any_not_dict539 log_msg = "processing " + str(alert_dict)540 do_log(log_msg)541 tokens = get_any_tokens(m_title, m_description)542 count = 0543 for token in set(tokens):544 token = token.upper()545 log_msg = "Looking for: " + token546 do_log(log_msg)547 if alert_dict.get(token):548 count += 1549 else:550 pass551 if count == 0:552 for key in alert_dict.keys():553 alerts_fired.append(alert_dict[key])554 return alerts_fired555def get_title_tokens(m_title):556 """ Make tokens out of title text, """557 message = nltk.clean_html(m_title)558 message = nltk.word_tokenize(message)559 punctuation = re.compile(r'[-.?!,&":;()|0-9]')560 tokens = [punctuation.sub(" ", token) for token in message]561 return tokens562def process_title_dict(alert_dict, title_dict, m_title, alerts_fired):563 """ If title contains matching text alert is triggered. """564 log_msg = "Processing TITLE matches for CONTAINS"565 do_log(log_msg)566 alert_dict = title_dict567 log_msg = "processing " + str(alert_dict)568 do_log(log_msg)569 tokens = get_title_tokens(m_title)570 for token in tokens:571 token = token.upper()572 log_msg = "Looking for: " + token573 do_log(log_msg)574 if title_dict.get(token):575 alerts_fired.append(title_dict[token])576 else:577 pass578 return alerts_fired579def process_title_not_dict(alert_dict, title_not_dict, m_title, alerts_fired):580 """ If title does not contain matching text alert is triggered. 
"""581 log_msg = "Processing TITLE matches for DOES NOT CONTAIN"582 do_log(log_msg)583 alert_dict = title_not_dict584 log_msg = "processing " + str(alert_dict)585 do_log(log_msg)586 tokens = get_title_tokens(m_title)587 count = 0588 for token in tokens:589 token = token.upper()590 log_msg = "Looking for: " + token591 do_log(log_msg)592 if alert_dict.get(token):593 count += 1594 else:595 pass596 if count == 0:597 for key in alert_dict.keys():598 alerts_fired.append(alert_dict[key])599 return alerts_fired600def process_tickers_dict(alert_dict, tickers_dict, m_tickers, alerts_fired):601 """ If tickers contains matching symbol alert is triggered. """602 log_msg = "Processing TICKERS matches for CONTAINS"603 do_log(log_msg)604 alert_dict = tickers_dict605 log_msg = "processing " + str(alert_dict)606 do_log(log_msg)607 for token in set(m_tickers.split(',')):608 token = token.upper()609 log_msg = "Looking for: " + token610 do_log(log_msg)611 if alert_dict.get(token):612 alerts_fired.append(alert_dict[token])613 else:614 pass615 return alerts_fired616def process_tickers_not_dict(alert_dict, tickers_not_dict, m_tickers,617 alerts_fired):618 """ If tickers does not contain matching symbol alert is triggered. """619 alert_dict = tickers_not_dict620 log_msg = "processing " + str(alert_dict)621 do_log(log_msg)622 log_msg = "Processing TICKERS matches for DOES NOT CONTAIN"623 do_log(log_msg)624 for token in set(m_tickers.split(',')):625 token = token.upper()626 log_msg = "Looking for: " + token627 do_log(log_msg)628 count = 0629 if alert_dict.get(token):630 count += 1631 else:632 pass633 if count == 0:634 for key in alert_dict.keys():635 alerts_fired.append(alert_dict[key])636 return alerts_fired637def process_entities_dict(alert_dict, entities_dict, m_entities, alerts_fired):638 """ If entities contains matching name alert is triggered. """639 log_msg = "Processing ENTITIES matches for CONTAINS"640 do_log(log_msg)641 alert_dict = entities_dict642 for token in set(m_entities.split(',')):643 token = token.upper()644 log_msg = "Looking for: " + token645 do_log(log_msg)646 if alert_dict.get(token):647 alerts_fired.append(alert_dict[token])648 else:649 pass650 return alerts_fired651def process_entities_not_dict(alert_dict, entities_not_dict, m_entities,652 alerts_fired):653 """ If entities does contain matching name alert is triggered. """654 alert_dict = entities_not_dict655 log_msg = "processing " + str(alert_dict)656 do_log(log_msg)657 log_msg = "Processing ENTITIES matches for DOES NOT CONTAIN"658 do_log(log_msg)659 count = 0660 for token in set(m_entities.split(',')):661 token = token.upper()662 log_msg = "Looking for: " + token663 do_log(log_msg)664 if alert_dict.get(message[token]):665 count += 1666 else:667 pass668 if count == 0:669 for key in alert_dict.keys():670 alerts_fired.append(entities_not_dict[key])671 return alerts_fired672def process_alerts_fired(alerts_fired, rules, alert_out_message):673 """ Process alerts_fired list. 
"""674 log_msg = "Beginning processing of alerts_fired" + (675 str(alerts_fired))676 do_log(log_msg)677 alerts_list = []678 for id_ in alerts_fired:679 for i in id_:680 do_log("alerts_fired contains " + str(alerts_fired))681 do_log("alerts_fired id_[0] = " + str(i))682 alerts_list.append(i)683 do_log("Added " + str(i) + " to alerts_list -> " + (684 str(alerts_list)))685 alerts_list = set(alerts_list)686 do_log("alerts_list contains: " + str(alerts_list))687 print "alerts_list is:" + str(alerts_list)688 try:689 for alert_ in alerts_list:690 do_log("Alert is type " + str(type(alert_)))691 log_msg = "Processing profile for alert " + str(alert_)692 do_log(log_msg)693 do_log(str(rules))694 rule_df = DataFrame()695 rule_df = rules[rules['id_'].isin([int(alert_), ])]696 do_log("rule_df contains: " + str(rule_df))697 if rule_df:698 for i, row in enumerate(699 rule_df['username'].values):700 username = row701 username = str(username)702 do_log(username)703 for i, row in enumerate(704 rule_df['user_alert_name'].values):705 user_alert_name = row706 user_alert_name = str(user_alert_name)707 do_log("Found " + user_alert_name)708 try:709 #get user profile to determine alert actions710 user_profile = load_user_profile(username)711 if user_alert_actions == 'TEXT':712 #just send an sms713 target = user_profile['user_mobile'] + "@" + (714 user_profile['user_carrier'])715 send_alert(target, user_alert_name,716 alert_out_message)717 log_msg = "Sending TEXT to " + str(target)718 do_log(log_msg)719 send_alert(target, user_alert_name,720 alert_out_message)721 if user_alert_actions == 'TEXT & EMAIL':722 #send both sms and email723 target = (str(user_profile['user_mobile'])724 + "@"725 + str(user_profile['user_carrier'])726 + ".com")727 log_msg = ("Sending TEXT and EMAIL to "728 + str(target))729 do_log(log_msg)730 send_alert(target, user_alert_name,731 alert_out_message)732 target = str(user_profile['user_email'])733 log_msg = "Sending EMAIL to " + str(target)734 do_log(log_msg)735 send_alert(target, user_alert_name,736 alert_out_message)737 if user_alert_actions == 'EMAIL':738 log_msg = "Sending EMAIL to " + str(target)739 do_log(log_msg)740 target = str(user_profile['user_email'])741 send_alert(target, user_alert_name,742 alert_out_message)743 except Exception as error:744 do_log(str(error) + " in Sending function for " + (745 str(username) + " alert: " + str(alert_)))746 except Exception as error:747 do_log(str(error) + " in processing profile " + (748 str(username) + " alert: " + str(alert_)))749 pass750def process_message_p(in_messages_pool_q, out_messages_q, new_rules_q):751 """ Main function. 
    """ Main function. Load alerts from queue then filter incoming
    messages with alert key words and conditions. """
    while True:
        alerts_fired = []
        rules, sent_dict, title_dict, any_dict, \
            entities_dict, tickers_dict, any_not_dict, \
            title_not_dict, entities_not_dict, \
            tickers_not_dict = get_new_rules(new_rules_q)
        # get_new_message() is a generator; parse_message() returns the
        # message fields plus the pre-built outbound alert text
        message = get_new_message(in_messages_pool_q).next()
        alert_dict = {}
        (m_title, m_description, m_sentiment, m_tickers,
         m_entities, m_link), alert_out_message = parse_message(message)
        alerts_fired = process_sent_dict(alert_dict, sent_dict,
                                         m_sentiment, alerts_fired)
        alerts_fired = process_any_dict(alert_dict, any_dict,
                                        m_title, m_description, alerts_fired)
        alerts_fired = process_any_not_dict(alert_dict, any_not_dict,
                                            m_title, m_description,
                                            alerts_fired)
        alerts_fired = process_title_dict(alert_dict, title_dict, m_title,
                                          alerts_fired)
        alerts_fired = process_title_not_dict(alert_dict, title_not_dict,
                                              m_title, alerts_fired)
        alerts_fired = process_tickers_dict(alert_dict, tickers_dict,
                                            m_tickers, alerts_fired)
        alerts_fired = process_tickers_not_dict(alert_dict, tickers_not_dict,
                                                m_tickers, alerts_fired)
        alerts_fired = process_entities_dict(alert_dict, entities_dict,
                                             m_entities, alerts_fired)
        alerts_fired = process_entities_not_dict(alert_dict,
                                                 entities_not_dict,
                                                 m_entities, alerts_fired)
        process_alerts_fired(alerts_fired, rules, alert_out_message)

def start_module():
    """ Setup queues and start threads and processes. """
    in_messages_pool_q = Queue()
    out_messages_q = Queue()
    user_alerts_rules_q = Queue()
    new_rules_q = Queue()
    for i in range(0, 1):
        get_messages_t = threading.Thread(
            target=get_messages_from_kj_main_t,
            args=(UA_CONTEXT, USER_ALERTS_PULL_ADDR, in_messages_pool_q))
        get_messages_t.setDaemon(False)
        get_messages_t.start()
    for i in range(0, 1):
        load_user_alerts_t = threading.Thread(
            target=load_user_alerts_in_q,
            args=(user_alerts_rules_q,))  # args must be a tuple
        load_user_alerts_t.setDaemon(False)
        load_user_alerts_t.start()
    for i in range(0, 1):
        proc_rules_p = Process(target=parse_rules,
                               args=(user_alerts_rules_q, new_rules_q,))
        proc_rules_p.start()
    for i in range(0, 1):
        proc_messages_p = Process(
            target=process_message_p,
            args=(in_messages_pool_q, out_messages_q, new_rules_q,))
        proc_messages_p.start()
...
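For context, here is a minimal producer-side sketch; it is an assumption, not part of the module. The main KJ process is expected to PUSH a JSON-encoded story to USER_ALERTS_PULL_ADDR with the field names parse_message() reads, and the story payload below is hypothetical.

# Hypothetical producer-side sketch; field names mirror parse_message().
import simplejson as json
import zmq

context = zmq.Context()
push_socket = context.socket(zmq.PUSH)
push_socket.bind('tcp://127.0.0.1:6040')  # USER_ALERTS_PULL_ADDR above

story = {'m_title': 'Acme Corp beats earnings estimates',
         'm_description': '<p>Acme Corp reported record profits.</p>',
         'm_sentiment': '1.5',
         'm_tickers': 'ACME',
         'm_entities': 'ACME CORP',
         'm_link': 'http://example.com/story/123'}
push_socket.send_pyobj(json.dumps(story))  # the monitor calls recv_pyobj()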
grammar_test.py
Source: grammar_test.py
...
        rule_positional('ABC', 123, '=', '+')
            =
            'a'
            ;

        rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
            =
            'b'
            ;

        rule_all('DEF', 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
            =
            'c'
            ;
        '''
        pretty = '''
        start
            =
            {rule_positional | rule_keywords | rule_all} $
            ;

        rule_positional(ABC, 123, '=', '+')
            =
            'a'
            ;

        rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
            =
            'b'
            ;

        rule_all(DEF, 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
            =
            'c'
            ;
        '''
        model = genmodel('RuleArguments', grammar)
        self.assertEqual(trim(pretty), ustr(model))
        model = genmodel('RuleArguments', pretty)
        ast = model.parse("a b c")
        self.assertEqual(['a', 'b', 'c'], ast)
        semantics = TC36Semantics()
        ast = model.parse("a b c", semantics=semantics)
        self.assertEqual(['a', 'b', 'c'], ast)
        codegen(model)

    def test_36_unichars(self):
        grammar = '''
            start = { rule_positional | rule_keywords | rule_all }* $ ;

            rule_positional("ÃÃÃäöüÃ") = 'a' ;

            rule_keywords(k1='äöüÃÃÃÃ') = 'b' ;

            rule_all('ÃÃÃÃäöü', k1="ÃäöüÃÃÃ") = 'c' ;
        '''

        def _trydelete(pymodule):
            import os
            try:
                os.unlink(pymodule + ".py")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyc")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyo")
...
dataloader.py
Source: dataloader.py
# Imports
import torch
import numpy as np
import logging
import pickle
import os
import pytorch_lightning as pl
# Submodules
from typing import Union, List
from tqdm import tqdm, trange
from torch.utils.data import Dataset, TensorDataset
from snorkel.labeling import LFApplier
from snorkel_utils import make_keyword_lf

# Need to set tokenizers_parallelism environment variable to avoid lots of warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set up logging
logger = logging.getLogger('__file__')

# Collate function for RPNDataset
class RPNCollate():
    def __init__(self, tokenizer):
        # self.id2word = id2word
        self.tokenizer = tokenizer

    def __call__(self, batch):
        '''
        Collate function to turn batch from dataloader into clean dict of output
        '''
        # print(batch)
        # print("Length", len(batch))
        # seq, attn_mask, labels, noisy_labels, noised_ids, mlm_labels, starts, ends = *batch
        input_ids = torch.stack(tuple([x['input_ids'] for x in batch]))
        attn_mask = torch.stack(tuple([x['attention_masks'] for x in batch]))
        labels = torch.stack(tuple([x['labels'] for x in batch]))
        noisy_labels = torch.stack(tuple([x['noisy_labels'] for x in batch]))
        soft_labels = torch.stack(tuple([x['soft_labels'] for x in batch]))
        noised_ids = torch.stack(tuple([x['noised_ids'] for x in batch]))
        mlm_labels = torch.stack(tuple([x['mlm_labels'] for x in batch]))
        starts = [x['word_starts'] for x in batch]
        ends = [x['word_ends'] for x in batch]
        # Get batch indices and start/end indices of each word
        batch_inds = torch.cat(tuple([i*torch.ones_like(s).long() for i, s in enumerate(starts)])).reshape(-1,1)
        starts = torch.cat(tuple(starts)).reshape(-1,1)
        ends = torch.cat(tuple(ends)).reshape(-1,1)
        # Get tensor to select ids and/or embeddings for each word from a tensor
        word_lengths = ends - starts
        max_len = word_lengths.max()
        selector_inds = starts + torch.arange(max_len)
        selector_mask = (selector_inds < ends)
        selector_inds[~selector_mask] = 0
        # Get all words in the batch to be used for creating phrase-based rules
        batch_words = reconstruct_words(input_ids, starts, ends, self.tokenizer, batch_inds=batch_inds)
        output_dict = {
            'input_ids': input_ids,
            'attention_masks': attn_mask,
            'labels': labels,
            'noisy_labels': noisy_labels,
            'noised_ids': noised_ids,
            'mlm_labels': mlm_labels,
            'batch_inds': batch_inds,
            'word_starts': starts,
            'word_ends': ends,
            'word_inds': selector_inds,
            'word_mask': selector_mask,
            'batch_words': batch_words,
            'soft_labels': soft_labels
        }
        return output_dict

# Helper functions
# def reconstruct_words(input_ids, starts, ends, id2word, batch_inds=None):
def reconstruct_words(input_ids, starts, ends, tokenizer, batch_inds=None):
    '''
    Reconstruct all words in text from their input ids
    '''
    words = []
    ss = starts.flatten()
    es = ends.flatten()
    if batch_inds is not None:
        bs = batch_inds.flatten()
        words = [tokenizer.decode(input_ids[b, s:e]) for b, s, e in zip(bs, ss, es)]
        # for (b, s, e) in zip(bs, ss, es):
        #     if s - e == 1:
        #         words.append[id2word[input_ids[b, s:e].item()]]
        #     else:
        #         subword_ids = input_ids[b, s:e].numpy()
        #         words.append(tokenizer.decode(subword_ids))
        #         words.append(merge_tokens(subword_ids, id2word))
    else:
        words = [tokenizer.decode(input_ids[s:e]) for s, e in zip(ss, es)]
        # for (s, e) in zip(ss, es):
        #     if s - e == 1:
        #         words.append[id2word[input_ids[s:e].item()]]
        #     else:
        #         subword_ids = input_ids[s:e].numpy()
        #         words.append(tokenizer.decode(subword_ids))
        #         words.append(merge_tokens(subword_ids, id2word))
    return words

# def merge_tokens(subword_ids, id2word):
#     '''
#     Merge tokens from subword units
#     '''
#     tokens = [id2word[i] for i in subword_ids]
#     s = tokens[0]
#     for t in tokens[1:]:
#         if t.startswith('##'):
#             s += t[2:]
#         else:
#             s += ' ' + t
#     return s

def get_word_spans(word_ids, punct_inds=None):
    '''
    Get spans of whole words from a list of wordpiece -> word mappings

    Params:
    -------
    word_ids: List
        List of which word is mapped to each individual token

    Returns:
    --------
    span_starts: torch.LongTensor
        Array of starts of word spans
    span_ends: torch.LongTensor
        Array of ends of word spans

    Example:
    --------
    Sentence: "the dog jumped excitedly"
    -> Tokenized: ['[CLS]', 'the', 'dog', 'jump', '##ed', 'excit', '##ed', '##ly', '[SEP]']
    -> word_ids: [None, 0, 1, 2, 2, 3, 3, 3, None]
    -> Spans: [(0,0), (1,2), (2,3), (3,5), (5,8), (0,0)]
    Usage: self.get_word_spans(word_ids)  # word_ids as above
    -> returns: (tensor([1, 2, 3, 5]), tensor([2, 3, 5, 8]))
    '''
    prev_ind = None
    starts = []
    ends = []
    # Gather start and end indices
    for i, ind in enumerate(word_ids):
        if prev_ind != ind:
            if prev_ind != None:
                ends.append(i)
            if ind != None:
                starts.append(i)
        prev_ind = ind
    # Return tensors
    return (torch.LongTensor(starts), torch.LongTensor(ends))

def prep_data(text, tokenizer, max_length=128):
    '''
    Prep data for RPN usage
    '''
    enc = tokenizer(text, max_length=max_length, padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
    # Portion out different values
    encoded_text = enc['input_ids']
    attention_masks = enc['attention_mask']
    # Get word start/end indices
    word_spans = [get_word_spans(enc.word_ids(i)) for i in trange(len(text))]
    word_starts = [s[0] for s in word_spans]
    word_ends = [s[1] for s in word_spans]
    return encoded_text, attention_masks, word_starts, word_ends

class RPNDataset(Dataset):
    # RPN Dataset to mask keywords used in rules
    def __init__(self,
                 data,
                 tokenizer,
                 rule_keywords,
                 rule_tokens=[],
                 mask_prob=.1,
                 rule_mask_prob=.5,
                 seed_labels=None,
                 filter_labels=True,
                 max_length=128,
                 min_lf=1,
                 ):
        self.text = data['text']
        self.tokenizer = tokenizer
        if 'rule_keywords' in data:
            self.rule_keywords = data['rule_keywords']
        else:
            self.rule_keywords = rule_keywords
        # Tokenizer attributes
        self.word2id = tokenizer.vocab
        self.mask_id = self.word2id['[MASK]']
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.max_length = max_length
        # Make sure data is ready for deep learning models
        if 'encoded_text' not in data.keys():
            self.prepare_data()
        else:
            self.encoded_text = data['encoded_text']
            self.attention_masks = data['attention_masks']
            self.word_starts = data['word_starts']
            self.word_ends = data['word_ends']
        self.labels = data['labels']
        if 'word_lists' in data.keys():
            self.word_lists = data['word_lists']
        else:
            logger.info("Computing word lists")
            self.word_lists = [reconstruct_words(ids, starts, ends, self.tokenizer)
                               for (ids, starts, ends) in tqdm(zip(self.encoded_text,
                                                                   self.word_starts,
                                                                   self.word_ends))]
        # Make sure noisy labels are there
        self.min_lf = min_lf
        if 'noisy_labels' not in data:
            self.make_lfs(rpn_generated=False)
            self.make_noisy_labels()
        else:
            self.noisy_labels = data['noisy_labels']
        self.balance_noisy_labels()
        if 'soft_labels' in data:
            self.soft_labels = data['soft_labels']
        else:
            self.soft_labels = None  # was `soft_labels = None` in the original
            # self.soft_labels = data['soft_labels']

        # labeled_inds = ((self.noisy_labels >= 0).sum(dim=1) >= min_lf).nonzero().flatten()
        # logger.debug(labeled_inds.size)
        # logger.debug(f'Proportion labeled: {labeled_inds.size(0)/self.noisy_labels.size(0)}')
        # self.labeled_inds = labeled_inds

        # Get vocab size
        self.vocab_size = int(np.max(list(self.word2id.values())) + 1)
        self.num_special_tokens = int(np.max([val for key, val in self.word2id.items() if key.startswith('[')]) + 1)
        # Rule attributes
        self.rule_tokens = rule_tokens
        self.rule_map = {val: val for val in self.word2id.values()}
        self.update_rule_map(rule_tokens)
        self.is_rule = {val: 0 for val in self.word2id.values()}
        for w in rule_tokens:
            if w.strip() in self.word2id:
                self.is_rule[self.word2id[w.strip()]] = 1

        # Misc attributes
        self.p = mask_prob
        self.rule_p = rule_mask_prob
        self.length = len(self.text)
        self.idx_map = {i: i for i in range(self.length)}

    def prepare_data(self,):
        '''
        Prepare data by tokenizing, padding, and getting word start/end indices

        Params:
        -------
        text: List[str]
            List of text of each instance
        '''
        # Encode text
        enc = self.tokenizer(self.text, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt', return_token_type_ids=False)
        # Portion out different values
        self.encoded_text = enc['input_ids']
        self.attention_masks = enc['attention_mask']
        # Get word start/end indices
        word_spans = [get_word_spans(enc.word_ids(i)) for i in trange(len(self.text))]
        self.word_starts = [s[0] for s in word_spans]
        self.word_ends = [s[1] for s in word_spans]

    # Make more general to apply to n-grams/phrases
    def make_lfs(self, rpn_generated=True):
        '''
        Make labeling functions from keywords/phrases
        '''
        self.keyword_lfs = [make_keyword_lf(w, label, rpn_generated=rpn_generated) for label, words in self.rule_keywords.items() for w in words if not ' ' in w]
        self.phrase_lfs = [make_keyword_lf(w, label, rpn_generated=rpn_generated) for label, words in self.rule_keywords.items() for w in words if ' ' in w]

    def make_noisy_labels(self):
        '''
        Make noisy labels from labeling functions
        '''
        if len(self.keyword_lfs) > 0:
            keyword_applier = LFApplier(lfs=self.keyword_lfs)
            keyword_noisy_labels = torch.LongTensor(keyword_applier.apply(self.word_lists))
            noisy_labels = keyword_noisy_labels
        if len(self.phrase_lfs) > 0:
            phrase_applier = LFApplier(lfs=self.phrase_lfs)
            phrase_noisy_labels = torch.LongTensor(phrase_applier.apply(self.text))
            noisy_labels = phrase_noisy_labels
        if len(self.keyword_lfs) > 0 and len(self.phrase_lfs) > 0:
            noisy_labels = torch.cat((keyword_noisy_labels, phrase_noisy_labels), dim=1)
        self.full_noisy_labels = noisy_labels

    def balance_noisy_labels(self):
        '''
        Balance number of noisy labels for each class to prevent model imbalance
        '''
        self.noisy_labels = self.full_noisy_labels.clone()
        label_counts = [(self.noisy_labels == label).sum().item() for label in self.rule_keywords.keys()]
        logger.debug(f"Old label counts: {label_counts}")
        # Balance classes
        count_min = min(label_counts)
        for label in self.rule_keywords.keys():
            count = (self.noisy_labels == label).sum()
            cutoff = (count - count_min)/count
            mask = (torch.rand(self.noisy_labels.size()) < cutoff) & (self.noisy_labels == label)
            self.noisy_labels[mask] = -1
        label_counts = [(self.noisy_labels == label).sum() for label in self.rule_keywords.keys()]
        logger.debug(f"New label counts: {label_counts}")
        labeled_inds = ((self.noisy_labels >= 0).sum(dim=1) >= self.min_lf).nonzero().flatten()
        # logger.debug(labeled_inds.size)
        logger.debug(f'Proportion labeled: {labeled_inds.size(0)/self.noisy_labels.size(0)}')
        self.labeled_inds = labeled_inds

    def _use_labeled(self):
        '''
        Switches model to only iterate through labeled data
        '''
        labeled_inds = ((self.noisy_labels >= 0).sum(dim=1) >= self.min_lf).nonzero().flatten()
        self.labeled_inds = labeled_inds
        self.length = self.labeled_inds.size(0)
        self.idx_map = {i: self.labeled_inds[i] for i in range(self.length)}
        # Debugging statements
        # logger.debug(labeled_inds.size)
        logger.debug(f'Proportion labeled: {labeled_inds.size(0)/self.noisy_labels.size(0)}')

    # return noisy_labels
    # def precompute_phrase_counts(self):
    #     '''
    #     Precompute word counts for faster model training
    #     '''
    #     phrase_counts = defaultdict(int)
    #     phrase_inds = defaultdict(set)
    #     normalized_text = []
    #     logger.info("Precomputing phrase counts")
    #     for j, word_list in enumerate(tqdm(self.train['word_lists'])):
    #         normalized_text.append(" ".join(word_list))
    #         # normalized_text.append(self.tokenizer.decode(self.tokenizer.encode(word_list)[1:-1]))
    #         for l in range(1, 1 + self.args.max_rule_length):
    #             phrases = [" ".join(word_list[i:i+l]) for i in range(len(word_list) - l + 1)]
    #             for p in phrases:
    #                 if any([punct in p for punct in '.,!?"\\']):
    #                     continue
    #                 phrase_counts[p] += 1
    #                 phrase_inds[p].add(j)
    #     self.train['text'] = normalized_text
    #     self.phrase_counts = {k:v for k, v in phrase_counts.items() if v >= self.min_count_cutoff and k not in self.words_to_exclude}
    #     logger.debug(f"Num Phrases: {len(self.phrase_counts)}")
    #     self.phrase_inds = {k:list(phrase_inds[k]) for k in self.phrase_counts.keys()}

    def update_rule_map(self, kwds):
        for kwd in kwds:
            self.rule_map[kwd] = self.mask_id

    def token_match(self, token, alg='random', n=5):
        '''
        Match examples based on token
        '''
        pass

    def phrase_match(self, phrase, alg='random', n=5):
        '''
        Match examples based on phrase
        '''
        pass

    # Needs updating for whole words/phrases
    def noise_input_tokens(self, seq, p=1):
        '''
        Add noise to input sequences for MLM loss

        Inputs:
        -------
        seq: Input sequence on which to mask tokens
        p: Probability with which to mask each token from a rule
        '''
        rule_tokens = torch.tensor([self.is_rule[w.item()] for w in seq]).bool()
        # rule_mask_ps = (torch.ones_like(rule_tokens) * p)
        # rule_draws = torch.bernoulli(rule_mask_ps).bool()
        # masked_rule_tokens = (rule_tokens & rule_draws)
        # MLM Loss
        ps = self.p * torch.ones_like(seq)
        mlm_mask = (torch.bernoulli(ps).bool() & (seq >= self.num_special_tokens))
        # mask = (mlm_mask | masked_rule_tokens)
        mask = (mlm_mask | rule_tokens)
        # # Debugging
        # if rule_tokens.sum() > 0:
        #     logger.debug(rule_tokens.sum())
        # if mlm_mask.sum() != mask.sum():
        #     logger.debug(f"mlm_mask: {mlm_mask.sum()}")
        #     logger.debug(f"mask: {mask.sum()}")
        #     logger.debug("mask should be larger")
        # Labels
        mlm_labels = seq.clone()
        mlm_labels[~mask] = -100
        # Get masks of how to noise tokens
        a = torch.rand(seq.size())
        mask_token_locs = (mask & (a < .8))
        random_token_locs = (mask & (a > .9))
        num_random = random_token_locs.sum()
        random_tokens = torch.randint(low=self.num_special_tokens,
                                      high=self.vocab_size,
                                      size=(num_random.item(),))
        # Noise input ids
        noised_ids = seq.clone()
        noised_ids[mask_token_locs] = self.mask_id
        noised_ids[random_token_locs] = random_tokens
        return noised_ids, mlm_labels

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        idx = self.idx_map[i]
        seq = self.encoded_text[idx]
        attn_mask = self.attention_masks[idx]
        labels = self.labels[idx]
        noisy_labels = self.noisy_labels[idx]
        noised_ids, mlm_labels = self.noise_input_tokens(seq)
        starts = self.word_starts[idx]
        ends = self.word_ends[idx]
        soft_labels = self.soft_labels[idx]
        output_dict = {'input_ids': seq,
                       'attention_masks': attn_mask,
                       'labels': labels,
                       'noisy_labels': noisy_labels,
                       'noised_ids': noised_ids,
                       'mlm_labels': mlm_labels,
                       'word_starts': starts,
                       'word_ends': ends,
                       'soft_labels': soft_labels,
                       }
        # return seq, attn_mask, labels, noisy_labels, noised_ids, mlm_labels, starts, ends
        return output_dict

    def save(self, filepath):
        '''
        Save data module to file
        '''
        with open(filepath, 'wb') as f:
            pickle.dump(self.__dict__, f)

    @classmethod
    def load(self, filepath):
        '''
        Load data module from file
        '''
        with open(filepath, 'rb') as f:  # was 'wb'; reading requires 'rb'
            self.__dict__ = pickle.load(f)

class RegalDataset(Dataset):
    # RPN Dataset to mask keywords used in rules
    def __init__(self,
                 text,
                 encoded_text,
                 attention_masks,
                 labels,
                 tokenizer,
                 rules,
                 mask_prob=.1):
        '''
        Initialize dataset class

        Inputs:
        text: List of str
            Input text of datapoints to classify
        labels: List of torch.LongTensor
            Labels corresponding to each datapoint
        tokenizer:
            Huggingface tokenizer object to encode text
        rules: List of Rule
            Labeling functions to create noisy labels
        '''
        self.text = data['text']
        self.encoded_text = data['encoded_text']
        self.attention_masks = data['attention_masks']
        self.labels = data['labels']
        self.noisy_labels = data['noisy_labels']
        # Tokenizer attributes
        self.tokenizer = tokenizer
        self.word2id = tokenizer.vocab
        self.mask_id = self.word2id['[MASK]']

        # Get vocab size
        self.vocab_size = int(np.max(list(self.word2id.values())) + 1)
        self.num_special_tokens = int(np.max([val for key, val in self.word2id.items() if key.startswith('[')]) + 1)
        # Rule attributes
        self.rule_tokens = rule_tokens
        self.rule_map = {val: val for val in self.word2id.values()}
        self.update_rule_map(rule_tokens)
        self.is_rule = {val: 0 for val in self.word2id.values()}
        for w in rule_tokens:
            self.is_rule[self.word2id[w]] = 1

        # Misc attributes
        self.p = mask_prob
        self.length = len(self.text)

    def __len__(self):
        '''
        Length attribute
        '''
        return self.length

    def __getitem__(self, idx):
        '''
        Return items from dataset for dataloader
        '''
        seq = self.encoded_text[idx]
        attn_mask = self.attention_masks[idx]
        labels = self.labels[idx]
        noisy_labels = self.noisy_labels[idx]
        noised_ids, mlm_labels = self.noise_input_tokens(seq)
...
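A hypothetical wiring sketch follows; the tokenizer, toy data dict, and rule_keywords below are assumptions and not part of this file. It shows the intended flow: prep_data() builds the encoded fields RPNDataset expects, and RPNCollate is passed to a torch DataLoader as collate_fn.

# Hypothetical usage sketch; assumes a Hugging Face *fast* tokenizer,
# since prep_data() calls enc.word_ids().
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
text = ['the market rallied today', 'shares fell sharply']
encoded_text, attention_masks, word_starts, word_ends = prep_data(text, tokenizer)
data = {'text': text,
        'encoded_text': encoded_text,
        'attention_masks': attention_masks,
        'word_starts': word_starts,
        'word_ends': word_ends,
        'labels': torch.LongTensor([1, 0]),
        'soft_labels': torch.zeros(2, 2)}
rule_keywords = {1: ['rallied'], 0: ['fell']}  # label -> keywords
dataset = RPNDataset(data, tokenizer, rule_keywords)
loader = DataLoader(dataset, batch_size=2, collate_fn=RPNCollate(tokenizer))
batch = next(iter(loader))  # dict with input_ids, noisy_labels, mlm_labels, ...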
parameter_test.py
Source: parameter_test.py
...
        start
            = {rule_positional | rule_keywords | rule_all} $ ;

        rule_positional('ABC', 123, '=', '+')
            = 'a' ;

        rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
            = 'b' ;

        rule_all('DEF', 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
            = 'c' ;
        '''
        pretty = '''
        @@ignorecase :: False
        @@nameguard :: True

        start
            =
            {rule_positional | rule_keywords | rule_all} $
            ;

        rule_positional(ABC, 123, '=', '+')
            =
            'a'
            ;

        rule_keywords(k1=ABC, k3='=', k4='+', k2=123)
            =
            'b'
            ;

        rule_all(DEF, 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789)
            =
            'c'
            ;
        '''
        model = compile(grammar, 'RuleArguments')
        self.assertEqual(trim(pretty), str(model))
        model = compile(pretty, 'RuleArguments')
        ast = model.parse("a b c")
        self.assertEqual(['a', 'b', 'c'], ast)
        semantics = TC36Semantics()
        ast = model.parse("a b c", semantics=semantics)
        self.assertEqual(['a', 'b', 'c'], ast)
        codegen(model)

    def test_36_unichars(self):
        grammar = '''
            start = { rule_positional | rule_keywords | rule_all }* $ ;

            rule_positional("ÃÃÃäöüÃ") = 'a' ;

            rule_keywords(k1='äöüÃÃÃÃ') = 'b' ;

            rule_all('ÃÃÃÃäöü', k1="ÃäöüÃÃÃ") = 'c' ;
        '''

        def _trydelete(pymodule):
            import os
            try:
                os.unlink(pymodule + ".py")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyc")
            except EnvironmentError:
                pass
            try:
                os.unlink(pymodule + ".pyo")
...
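For running the rule-arguments grammar outside the test harness, here is a minimal sketch. The tatsu import is an assumption (these tests pull compile()/parse() from the package's own test utilities, but call them the same way); the expected result mirrors the assertions above.

# Minimal sketch, assuming TatSu's top-level API; the grammar mirrors the
# rule-argument tests above.
import tatsu

grammar = '''
    start = { rule_positional | rule_keywords | rule_all } $ ;
    rule_positional('ABC', 123, '=', '+') = 'a' ;
    rule_keywords(k1=ABC, k3='=', k4='+', k2=123) = 'b' ;
    rule_all('DEF', 456, '=', '+', k1=HIJ, k3='=', k4='+', k2=789) = 'c' ;
'''

model = tatsu.compile(grammar, 'RuleArguments')
print(model.parse("a b c"))  # expected ['a', 'b', 'c'], as asserted above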