Source: location_predict.py
# coding: utf-8
# In[133]:
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn import svm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
# Read the train/test label data and merge them
train_data = pd.read_csv('./data/train/train_labels.txt', sep=u'|', header=None).dropna(axis=1)
train_data.columns = ['uid', 'sex', 'age', 'location']
test_data = pd.read_csv('./data/valid/valid_nolabel.txt', sep=u'|', header=None).dropna(axis=1)
test_data.columns = ['uid']
total_data = pd.concat([train_data, test_data], axis=0)
# Read the train/test info data and merge them
train_data_info = pd.read_csv('./data/train/train_info.txt', sep=u'|', header=None).dropna(axis=1)
train_data_info.columns = ['uid', 'name', 'image']
train_data_info = train_data_info.drop_duplicates()
test_data_info = pd.read_csv('./data/valid/valid_info.txt', sep=u'|', header=None).dropna(axis=1)
test_data_info.columns = ['uid', 'name', 'image']
test_data_info = test_data_info.drop_duplicates()
total_data_info = pd.concat([train_data_info, test_data_info], axis=0)
total_data_info = total_data_info.drop_duplicates('uid')
# Read the train/test links data (uid followed by its linked uids) and merge them
links = []
for i, line in enumerate(open('./data/train/train_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
train_data_links = pd.DataFrame(links)
train_data_links = train_data_links.drop_duplicates()
links = []
for i, line in enumerate(open('./data/valid/valid_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
test_data_links = pd.DataFrame(links)
test_data_links = test_data_links.drop_duplicates()
total_data_links = pd.concat([train_data_links, test_data_links], axis=0)
# Read the train/test status data and merge them
status = []
for i, line in enumerate(open('./data/train/train_status.txt')):
    # split each line at the first comma: uid, then the rest of the record
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
train_data_status = pd.DataFrame(status)
status = []
for i, line in enumerate(open('./data/valid/valid_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
test_data_status = pd.DataFrame(status)
total_data_status = pd.concat([train_data_status, test_data_status], axis=0)
# Merge the tables provided by the task
merge_data = pd.merge(total_data, total_data_info, on='uid', how='left')
merge_data = pd.merge(merge_data, total_data_links, on='uid', how='left')
merge_data.index = range(len(merge_data))
##################################################################################
# Split each status record into retweet count, review count, source, time and content
total_data_status['retweet'] = total_data_status.sta.map(lambda s: int(s.split(',')[0]))
total_data_status['review'] = total_data_status.sta.map(lambda s: int(s.split(',')[1]))
total_data_status['source'] = total_data_status.sta.map(lambda s: s.split(',')[2])
total_data_status['time'] = total_data_status.sta.map(lambda s: s.split(',')[3])
total_data_status['content'] = total_data_status.sta.map(lambda s: ','.join(s.split(',')[4:]))
contents = total_data_status.groupby('uid')['content'].agg(lambda lst: ' '.join(lst))
merge_data['contents'] = merge_data.uid.map(contents)
merge_data['sum_content'] = merge_data.uid.map(total_data_status.groupby('uid').size())
# Per-user statistical features
merge_data['max_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('max'))
merge_data['max_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('max'))
merge_data['min_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('min'))
merge_data['min_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('min'))
merge_data['median_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('median'))
merge_data['median_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('median'))
merge_data['mean_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('mean'))
merge_data['mean_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('mean'))
merge_data['std_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('std'))
merge_data['std_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('std'))
# Mapping from location (province/city) names to region labels
# (the Chinese literals in this listing are encoding-damaged)
d = {'ç³å®¶åº': 'åå', 'åéµåº': 'åä¸', 'æ·±å³': 'åå', '广å·': 'åå',
     'å®å®': 'åå', 'ååº': 'åä¸', 'æ²å¸': 'åä¸', 'æ¦æ±': 'åä¸',
     'è¥é³': 'åä¸', 'å®é': 'åä¸', 'èé¨': 'åä¸', '西å®': '西å',
     'é¶å·': '西å', 'æé½': '西å', '绵é³': '西å', 'ä¸æµ·': 'åä¸',
     'äºå': '西å', 'åèå¤': 'åå', 'å京': 'åå', 'å°æ¹¾': 'åä¸',
     'åæ': 'ä¸å', 'åå·': '西å', '天津': 'åå', 'å®å¤': '西å',
     'å®å¾½': 'åä¸', 'å±±ä¸': 'åä¸', '山西': 'åå', 'è¾½å®': 'ä¸å',
     'éåº': '西å', 'é西': '西å', 'éæµ·': '西å', 'é¦æ¸¯': 'åå',
     'é»é¾æ±': 'ä¸å', 'é¿ç½': 'ä¸å', '丹ä¸': 'ä¸å', '大庸桥': 'ä¸å',
     'æ²é³': 'ä¸å', '大è¿': 'ä¸å', 'æ顺': 'ä¸å', 'ç³å®¶åº': 'åå',
     'æé³': 'åå', '广ä¸': 'åå', '广西': 'åå', 'æ°ç': '西å',
     'æ±è': 'åä¸', 'æ±è¥¿': 'åä¸', 'æ²³å': 'åå', 'æ²³å': 'åä¸',
     'æµæ±': 'åä¸', 'æµ·å': 'åå', 'æ¹å': 'åä¸', 'æ¹å': 'åä¸',
     'æ¾³é¨': 'åå', 'çè': '西å', 'ç¦å»º': 'åä¸', '西è': '西å',
     'è´µå·': '西å'}
# Convert location and age into the ranges required for submission
def trans_loc(s):
    if pd.isnull(s):
        return s
    s = s.split(' ')[0]
    if s == 'None':
        return 'åå'
    if s == 'æµ·å¤':
        return 'å¢å¤'
    return d[s]
def trans_age(age):
    if pd.isnull(age):
        return age
    if age <= 1979:
        return "-1979"
    elif age <= 1989:
        return "1980-1989"
    else:
        return "1990+"
merge_data['location2'] = merge_data['location'].map(trans_loc)
merge_data['age2'] = merge_data['age'].map(trans_age)
# Count province/city mentions in post sources and contents
src_lst = total_data_status.groupby('uid')['source'].agg(lambda lst: ' '.join(lst))
merge_data['source_content'] = merge_data['uid'].map(src_lst)
keys = '|'.join(d.keys())
merge_data['source_province'] = merge_data['source_content'].map(lambda s: ' '.join(re.findall(keys, s)))
merge_data['num_province'] = merge_data['contents'].map(lambda s: ' '.join(re.findall(keys, s)))
d = defaultdict(lambda: 'ç©º', d)
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=None, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_all_sp = tfv.fit_transform(merge_data['num_province'])
sum_province = X_all_sp.toarray()
for i in range(sum_province.shape[1]):
    merge_data['sum_province_%d' % i] = sum_province[:, i]
# Per-user content-length features (mean/min/max number of tokens per post)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.mean([len(s.split(' ')) for s in lst]))
merge_data['mean_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.min([len(s.split(' ')) for s in lst]))
merge_data['min_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.max([len(s.split(' ')) for s in lst]))
merge_data['max_content_len'] = merge_data['uid'].map(length)
merge_data['name_len'] = merge_data.name.map(lambda s: s if pd.isnull(s) else len(re.sub(r'[\u4e00-\u9fff]+', '', s)))
def num_missing(x):
    return sum(x.isnull())
merge_data['num_missing'] = merge_data.apply(num_missing, axis=1)
# Rank features
merge_data['rank_sum_content'] = merge_data['sum_content'].rank(method='max')
merge_data['rank_sum_fans'] = merge_data['sum_fans'].rank(method='max')
merge_data['rank_mean_retweet'] = merge_data['mean_retweet'].rank(method='max')
merge_data['rank_mean_review'] = merge_data['mean_review'].rank(method='max')
merge_data['rank_num_missing'] = merge_data['num_missing'].rank(method='max')
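
The sum_province_* columns above are built with a compact counting trick: re.findall pulls every province/city mention out of a user's concatenated posts, the defaultdict maps each mention to its region label, and a TfidfVectorizer with idf and normalisation switched off simply counts the mapped tokens. A minimal sketch of the same idea with a plain Counter, using a toy mapping and toy text rather than the competition data:

from collections import Counter, defaultdict

region_of = defaultdict(lambda: 'other', {'beijing': 'north', 'shanghai': 'east'})  # toy stand-in for d
mentions = 'beijing shanghai beijing'   # stand-in for ' '.join(re.findall(keys, contents))
counts = Counter(region_of[w] for w in mentions.split(' '))
print(counts)                           # Counter({'north': 2, 'east': 1})
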
# In[134]:
# Load the predictions of models trained on tfidf features; they are added as new
# features for the final model (stacking fusion)
tfidf_stacking = pd.read_csv('./data/newfeat/stack_new.csv')
merge_data = pd.concat([merge_data, tfidf_stacking], axis=1)
feat_time_3hour = pd.read_csv('./data/newfeat/feat_time_3hour.csv')
merge_data = pd.merge(merge_data, feat_time_3hour, on='uid', how='left')
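
stack_new.csv is consumed ready-made here; the listing never shows how it was produced. A minimal out-of-fold stacking sketch, assuming a sparse TF-IDF matrix X_text for the labelled users, their encoded labels y, and a matrix X_text_test for the unlabelled users (function and variable names are illustrative, not the original pipeline):

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

def make_stacking_feature(X_text, y, X_text_test, n_classes, seed=1):
    # out-of-fold class probabilities for labelled rows, full-fit probabilities for the rest
    oof = np.zeros((X_text.shape[0], n_classes))
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for tr_idx, va_idx in skf.split(X_text, y):
        clf = LogisticRegression(max_iter=1000).fit(X_text[tr_idx], y[tr_idx])
        oof[va_idx] = clf.predict_proba(X_text[va_idx])
    test_prob = LogisticRegression(max_iter=1000).fit(X_text, y).predict_proba(X_text_test)
    return oof, test_prob

Using out-of-fold rather than in-fold probabilities keeps the stacked features from leaking the training labels into the second-stage xgboost model.
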
# Load the predictions of the model trained on word2vec features
w2v_stacking = pd.read_csv('./data/newfeat/w2v_prob1.csv')
merge_data = pd.merge(merge_data, w2v_stacking, on='uid', how='left')
newmerge_feat1 = pd.read_csv('./data/newfeat/newmerge_feat.csv')
merge_data = pd.merge(merge_data, newmerge_feat1, on='uid', how='left')
feat_area1 = pd.read_csv('./data/newfeat/feat_area.csv')
merge_data = pd.merge(merge_data, feat_area1, on='uid', how='left')
# In[135]:
#########################################################################################
# Select feature columns by name prefix
cols = '|'.join(['twts_len', 'name_len', 'sum_province', 'sum_fans',
                 'age_', 'sex_', 'loc_',
                 'mean_retweet', 'sum_content', 'mean_review', 'num_missing',
                 'w2v_f_prob', 'w2v_m_prob', 'w2v_young_prob', 'w2v_old_prob', 'w2v_mid_prob',
                 'max_retweet', 'min_retweet', 'max_review', 'min_review',
                 'rank_sum_content', 'rank_sum_fans', 'rank_mean_retweet', 'rank_mean_review', 'rank_num_missing',
                 'timePeriod_3hour_0', 'timePeriod_3hour_1', 'timePeriod_3hour_2', 'timePeriod_3hour_3',
                 'timePeriod_3hour_4', 'timePeriod_3hour_5', 'timePeriod_3hour_6', 'timePeriod_3hour_7',
                 'name_isnull', 'image_isnull', 'fans_isnull', 'retweet_isnull', 'review_isnull',
                 'area_0', 'area_1', 'area_2', 'area_3', 'area_4', 'area_5', 'area_6', 'area_7'
                 ])
cols = [c for c in merge_data.columns if re.match(cols, c)]
# Encode the three targets (age bucket, region, sex) on the labelled rows
age_le = LabelEncoder()
ys = {}
ys['age'] = age_le.fit_transform(merge_data.iloc[:3200]['age2'])
loc_le = LabelEncoder()
ys['loc'] = loc_le.fit_transform(merge_data.iloc[:3200]['location2'])
sex_le = LabelEncoder()
ys['sex'] = sex_le.fit_transform(merge_data.iloc[:3200]['sex'])
merge_data = merge_data.fillna(0)
# In[136]:
task = ['tr']
TR = 3200   # number of labelled (training) users
TE = 1240   # size of the unlabelled (validation) set
X_all = merge_data[cols]
X = X_all[:TR]
prds = []
# In[137]:
# Degree-2 polynomial expansion, low-variance filtering and standardisation
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X_all)
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(0.001)
X_poly = vt.fit_transform(X_poly)
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_poly = ss.fit_transform(X_poly)
# In[138]:
X_poly = pd.DataFrame(X_poly)
X_poly.columns = 'Poly_' + X_poly.columns.astype(str)
# In[144]:
X_train = X_poly[:TR]
X_test = X_poly[TR:]
# In[145]:
#label=pd.read_csv('newlabel.csv',header=None,index_col=0)
#label.columns=['uid','age','gender','province']
# In[146]:
merge_data.iloc[:3200]['location2'].value_counts()
# In[416]:
########################
# Region (location) prediction
label = 'loc'
print('=' * 20)
print(label)
print('=' * 20)
y = ys[label]
dtrain = xgb.DMatrix(X_train, y)
dtest = xgb.DMatrix(X_test)
# In[424]:
params = {
    "objective": "multi:softprob",
    "booster": "gbtree",
    "eval_metric": "merror",
    "num_class": 8,
    'max_depth': 4,
    #'min_child_weight':2.5,
    'subsample': 0.65,
    'colsample_bytree': 1.0,
    'gamma': 2.5,
    "eta": 0.006,
    #"lambda":1,
    #'alpha':0,
    "silent": 1,
    #'seed':1123
}
xgb1 = xgb.train(params, dtrain, num_boost_round=25)
# In[425]:
pre = xgb1.predict(dtest, ntree_limit=25)
pre_loc = [loc_le.classes_[idx] for idx in pre.argmax(1)]
sub = pd.DataFrame()
sub['uid'] = merge_data.iloc[TR:]['uid']
sub['province'] = pre_loc
sub.to_csv('./data/location_sub.csv', index=False)
# In[426]:
#loc_pro2=pd.DataFrame(pre,columns=loc_le.classes_,index=test_data.uid)
# In[427]:
...
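
The final model above is trained with a fixed num_boost_round=25 and a very small eta; a quick cross-validated run on dtrain is a cheap way to sanity-check that choice before writing the submission. A sketch using xgboost's built-in CV, reusing the params dict from the listing (the fold count and early-stopping setting are assumptions, not from the original):

cv = xgb.cv(params, dtrain, num_boost_round=500, nfold=5, stratified=True,
            early_stopping_rounds=50, seed=1123, verbose_eval=50)
print(len(cv), cv['test-merror-mean'].min())   # rounds kept after early stopping, best CV error
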
Source: smp_merge_data.py
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from sklearn import svm
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
path = './data/'
# Read the train/test label data and merge them
train_data = pd.read_csv(path + 'train/train_labels.txt', sep=u'|', header=None).dropna(axis=1)
train_data.columns = ['uid', 'sex', 'age', 'location']
test_data = pd.read_csv(path + 'valid/valid_nolabel.txt', sep=u'|', header=None).dropna(axis=1)
test_data.columns = ['uid']
total_data = pd.concat([train_data, test_data], axis=0)
# Read the train/test info data and merge them
train_data_info = pd.read_csv(path + 'train/train_info.txt', sep=u'|', header=None).dropna(axis=1)
train_data_info.columns = ['uid', 'name', 'image']
train_data_info = train_data_info.drop_duplicates('uid')
test_data_info = pd.read_csv(path + 'valid/valid_info.txt', sep=u'|', header=None).dropna(axis=1)
test_data_info.columns = ['uid', 'name', 'image']
test_data_info = test_data_info.drop_duplicates('uid')
total_data_info = pd.concat([train_data_info, test_data_info], axis=0)
# Read the train/test links data and merge them
links = []
for i, line in enumerate(open(path + 'train/train_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
train_data_links = pd.DataFrame(links)
train_data_links = train_data_links.drop_duplicates()
links = []
for i, line in enumerate(open(path + 'valid/valid_links.txt')):
    line = line.split()
    row = {'uid': int(line[0]), 'sum_fans': len(line) - 1, 'fans': ' '.join(line[1:])}
    links.append(row)
test_data_links = pd.DataFrame(links)
test_data_links = test_data_links.drop_duplicates()
total_data_links = pd.concat([train_data_links, test_data_links], axis=0)
# Read the train/test status data and merge them
status = []
for i, line in enumerate(open(path + 'train/train_status.txt')):
    # split each line at the first comma: uid, then the rest of the record
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
train_data_status = pd.DataFrame(status)
status = []
for i, line in enumerate(open(path + 'valid/valid_status.txt')):
    l = re.search(',', line).span()[0]
    r = re.search(',', line).span()[1]
    row = {'uid': int(line[:l]), 'sta': line[r:]}
    status.append(row)
test_data_status = pd.DataFrame(status)
total_data_status = pd.concat([train_data_status, test_data_status], axis=0)
# Merge the tables provided by the task
merge_data = pd.merge(total_data, total_data_info, on='uid', how='left')
merge_data = pd.merge(merge_data, total_data_links, on='uid', how='left')
merge_data.index = range(len(merge_data))
##################################################################################
total_data_status['retweet'] = total_data_status.sta.map(lambda s: int(s.split(',')[0]))
total_data_status['review'] = total_data_status.sta.map(lambda s: int(s.split(',')[1]))
total_data_status['source'] = total_data_status.sta.map(lambda s: s.split(',')[2])
total_data_status['time'] = total_data_status.sta.map(lambda s: s.split(',')[3])
total_data_status['content'] = total_data_status.sta.map(lambda s: ','.join(s.split(',')[4:]))
contents = total_data_status.groupby('uid')['content'].agg(lambda lst: ' '.join(lst))
merge_data['contents'] = merge_data.uid.map(contents)
merge_data['sum_content'] = merge_data.uid.map(total_data_status.groupby('uid').size())
# Per-user statistical features
merge_data['max_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('max'))
merge_data['max_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('max'))
merge_data['min_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('min'))
merge_data['min_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('min'))
merge_data['median_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('median'))
merge_data['median_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('median'))
merge_data['mean_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('mean'))
merge_data['mean_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('mean'))
merge_data['std_retweet'] = merge_data.uid.map(total_data_status.groupby('uid')['retweet'].agg('std'))
merge_data['std_review'] = merge_data.uid.map(total_data_status.groupby('uid')['review'].agg('std'))
# Mapping from location (province/city) names to region labels
# (the Chinese literals in this listing are encoding-damaged)
d = {' ÃºââÃ': 'ªâ¢Â±Â±', 'ââ¡ÃâÃ': 'ªâ¢â´', 'â¦Ãâ¬â': 'ªâ¢ÆÅ', 'Ïâ÷âº': 'ªâ¢ÆÅ',
     '±¶ââ¤': 'ªâ¢ÆÅ', '¡ıâÃ': 'ªâ¢Ã·â', 'â¦â¥Â â': 'ªâ¢Ã·â', 'Åâ°â«â«': 'ªâ¢Ã·â',
     'ÅÃâÃ': 'ªâ¢Ã·â', 'ââ¤Â¬Î©': 'ªâ¢Ã·â', 'æ£ââ': 'ªâ¢Ã·â', 'ÅËââ¤': 'Å˱±',
     '⯥®': 'Å˱±', 'â¥â¦âº': 'ÅËÆÅ', 'ââ¡âÃ': 'ÅËÆÅ', 'â¦Åâ«Â£': 'ªâ¢â´',
     'ââÆÅ': 'ÅËÆÅ', 'Æâââ¦Ïâ': 'ªâ¢Â±Â±', '±±æ©': 'ªâ¢Â±Â±', 'îÃÃ': 'ªâ¢â´',
     'ºâ¢Â¡Ã·': 'â´±±', 'ÃÆ¥®': 'ÅËÆÅ', 'ÃÃΩÃ': 'ªâ¢Â±Â±', 'ÆËÅÆ': 'Å˱±',
     'ââ¤Âªâ': 'ªâ¢â´', 'â¦Î©â´': 'ªâ¢â´', 'â¦Î©ÅË': 'ªâ¢Â±Â±', '¡â¦ÆË': 'â´±±',
     '÷ÿ«Ã': 'ÅËÆÅ', 'â¦Â¬ÅË': 'Å˱±', '«â¡â«Â£': 'Å˱±', 'Åâââ¬': 'ªâ¢ÆÅ',
     'â«â¡ËΩâ ': 'â´±±', 'â¥Â§ââ': 'â´±±', 'µ§â´': 'â´±±', 'Â¥ÃâÏ«â': 'â´±±',
     'â¦ÃâÃ': 'â´±±', '¥á¨': 'â´±±', 'âÃÃâ¥': 'â´±±', ' ÃºââÃ': 'ªâ¢Â±Â±',
     'â¥ÃâÃ': 'ªâ¢Â±Â±', 'Ïââ´': 'ªâ¢ÆÅ', 'ÏâÅË': 'ªâ¢ÆÅ', 'â¬ΩÃ': 'Å˱±',
     'Ωâ Ãâ': 'ªâ¢â´', 'Ωâ ÅË': 'ªâ¢â´', 'â«â±±': 'ªâ¢Â±Â±', 'â«âÆÅ': 'ªâ¢Ã·â',
     'ââΩâ ': 'ªâ¢â´', 'â«Â£ÆÅ': 'ªâ¢ÆÅ', 'â«Ë±±': 'ªâ¢Ã·â', 'â«ËÆÅ': 'ªâ¢Ã·â',
     'âÆââ': 'ªâ¢ÆÅ', 'â Ãâ¡': 'Å˱±', 'â£Ω®': 'ªâ¢â´', 'ÅËâ¤Ã¿': 'ÅËÆÅ',
     'ÏÃ÷âº': 'ÅËÆÅ'}
# Convert location and age into the ranges required for submission
def trans_loc(s):
    if pd.isnull(s):
        return s
    s = s.split(' ')[0]
    if s == 'None':
        return 'ªâ¢Â±Â±'
    if s == 'â«Â£Ãâ':
        return s
    return d[s]
def trans_age(age):
    if pd.isnull(age):
        return age
    if age <= 1979:
        return "-1979"
    elif age <= 1989:
        return "1980-1989"
    else:
        return "1990+"
merge_data['location2'] = merge_data['location'].map(trans_loc)
merge_data['age2'] = merge_data['age'].map(trans_age)
# Count province/city mentions in post sources and contents
src_lst = total_data_status.groupby('uid')['source'].agg(lambda lst: ' '.join(lst))
merge_data['source_content'] = merge_data['uid'].map(src_lst)
keys = '|'.join(d.keys())
merge_data['source_province'] = merge_data['source_content'].map(lambda s: ' '.join(re.findall(keys, s)))
merge_data['num_province'] = merge_data['contents'].map(lambda s: ' '.join(re.findall(keys, s)))
d = defaultdict(lambda: 'øâ', d)
tokenizer = lambda line: [d[w] for w in line.split(' ')]
tfv = TfidfVectorizer(tokenizer=tokenizer, norm=None, use_idf=False, smooth_idf=False, sublinear_tf=False)
X_all_sp = tfv.fit_transform(merge_data['num_province'])
sum_province = X_all_sp.toarray()
for i in range(sum_province.shape[1]):
    merge_data['sum_province_%d' % i] = sum_province[:, i]
# Per-user content-length features (mean/min/max number of tokens per post)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.mean([len(s.split(' ')) for s in lst]))
merge_data['mean_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.min([len(s.split(' ')) for s in lst]))
merge_data['min_content_len'] = merge_data['uid'].map(length)
length = total_data_status.groupby('uid')['content'].agg(lambda lst: np.max([len(s.split(' ')) for s in lst]))
merge_data['max_content_len'] = merge_data['uid'].map(length)
merge_data['name_len'] = merge_data.name.map(lambda s: s if pd.isnull(s) else len(re.sub(r'[\u4e00-\u9fff]+', '', s)))
def num_missing(x):
    return sum(x.isnull())
merge_data['num_missing'] = merge_data.apply(num_missing, axis=1)
# Rank features
merge_data['rank_sum_content'] = merge_data['sum_content'].rank(method='max')
merge_data['rank_sum_fans'] = merge_data['sum_fans'].rank(method='max')
merge_data['rank_mean_retweet'] = merge_data['mean_retweet'].rank(method='max')
merge_data['rank_mean_review'] = merge_data['mean_review'].rank(method='max')
merge_data['rank_num_missing'] = merge_data['num_missing'].rank(method='max')
...
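
smp_merge_data.py concatenates the labelled users first and the unlabelled users after them, which is why the modelling script above slices merge_data with iloc[:3200]. A small sketch of recovering the two parts from the frames built here without hard-coding that count (illustrative only; it assumes the left merges preserved the concatenation order, as written):

n_train = len(train_data)              # labelled users come first in total_data and merge_data
train_part = merge_data.iloc[:n_train]
test_part = merge_data.iloc[n_train:]
assert len(test_part) == len(test_data)
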
Source: prepare_data.py
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import sys

# Positional command-line arguments: input CSVs, label file, output path
args = sys.argv
order_file_name = args[1]
group_file_name = args[2]
airline_file_name = args[3]
train_set_file_name = args[4]
total_dataset = args[5]

order = pd.read_csv(order_file_name)
group = pd.read_csv(group_file_name)
airline = pd.read_csv(airline_file_name)

group['product_name_price_min'] = group['price'].groupby(group['product_name']).transform('min')
merge_data = order.merge(group, on=['group_id'], how='left')

# CP (value-for-money): price / number of days
merge_data["cp"] = merge_data["price"] / merge_data['days']

# source: one-hot encode the two source columns
source_1_dummy = pd.get_dummies(merge_data["source_1"])
source_2_dummy = pd.get_dummies(merge_data["source_2"])
merge_data = pd.concat([merge_data, source_1_dummy], axis=1)
merge_data = pd.concat([merge_data, source_2_dummy], axis=1)

# Whether the order is the lowest price for the same itinerary (product_name)
merge_data['price-min'] = merge_data['price'] - merge_data['product_name_price_min']
merge_data['is-min-price'] = 0
merge_data.loc[merge_data['price-min'] == 0, ['is-min-price']] = 1

# Number of orders for the same itinerary (group)
merge_data['num_same_group'] = merge_data[['order_id']].groupby(merge_data['group_id']).transform('count')

# Total number of people on the same itinerary (group)
merge_data['total_people_amount'] = merge_data[['people_amount']].groupby(merge_data['group_id']).transform('sum')

# Itinerary promotion flag: mark product names containing promotion-related keywords
merge_data["discount"] = 0
merge_data.loc[merge_data["product_name"].str.contains("ç") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("æ") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("è´") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("é") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("æ¸") == True, ["discount"]] = 1
merge_data.loc[merge_data["product_name"].str.contains("åªæ ") == True, ["discount"]] = 1
merge_data.drop(['product_name'], axis=1, inplace=True)

# Convert the date columns and derive date-based features
merge_data['begin_date'] = pd.to_datetime(merge_data['begin_date'])
merge_data['order_date'] = pd.to_datetime(merge_data['order_date'])
merge_data['begin_date_month'] = merge_data["begin_date"].dt.month
merge_data['order_date_month'] = merge_data["order_date"].dt.month
merge_data['order_date_dayofweek'] = merge_data['order_date'].dt.dayofweek
merge_data['begin_date_dayofweek'] = merge_data['begin_date'].dt.dayofweek
merge_data['order_date_isweekend'] = 0
merge_data['begin_date_isweekend'] = 0
merge_data.loc[merge_data['order_date_dayofweek'] == 5, ['order_date_isweekend']] = 1
merge_data.loc[merge_data['order_date_dayofweek'] == 6, ['order_date_isweekend']] = 1
merge_data.loc[merge_data['begin_date_dayofweek'] == 5, ['begin_date_isweekend']] = 1
merge_data.loc[merge_data['begin_date_dayofweek'] == 6, ['begin_date_isweekend']] = 1

# Flight handling
# Outbound take-off time and return arrival time per group
go_fly = airline[["group_id", "fly_time", "arrive_time"]].copy()
go_fly['fly_time'] = airline['fly_time'].groupby(airline['group_id']).transform('min')
go_fly['fly_time'] = pd.to_datetime(go_fly['fly_time'])
go_fly['arrive_time'] = airline['arrive_time'].groupby(airline['group_id']).transform('max')
go_fly['arrive_time'] = pd.to_datetime(go_fly['arrive_time'])
go_fly = go_fly.drop_duplicates()
merge_data = merge_data.merge(go_fly, on=['group_id'], how='left')

# How many flights the whole itinerary includes
count = airline.groupby(['group_id']).size().to_frame("fly_count")
merge_data = merge_data.merge(count, on=['group_id'], how='left')

# Drop unused columns
merge_data.drop(['source_1'], axis=1, inplace=True)
merge_data.drop(['source_2'], axis=1, inplace=True)
merge_data.drop(['unit'], axis=1, inplace=True)
merge_data.drop(['area'], axis=1, inplace=True)
merge_data.drop(['sub_line'], axis=1, inplace=True)
merge_data.drop(['promotion_prog'], axis=1, inplace=True)

training_set = pd.read_csv(train_set_file_name)
merge_data = merge_data.merge(training_set, on=['order_id'], how='left')
merge_data = merge_data.dropna()  # drop rows with missing values

print(merge_data.info())
merge_data.to_csv(total_dataset, index=False)
...
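
prepare_data.py is driven entirely by positional command-line arguments: the order, group and airline CSVs, the training-set label file, and the output path, in that order. A typical invocation therefore looks like the line below; the file names are placeholders, not paths from the source:

python prepare_data.py order.csv group.csv airline.csv training_set.csv total_dataset.csv
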