Best Python code snippet using autotest_python
_1_encode_cat_features.py
Source: _1_encode_cat_features.py
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import scipy as sc
import scipy.sparse as sp
from sklearn.utils import check_random_state
import pylab
import sys
import time
# sys.path.append('/home/zzhang/Downloads/xgboost/wrapper')
import xgboost as xgb
from joblib import dump, load, Parallel, delayed
import utils
from utils import *

raw_data_path = utils.raw_data_path
tmp_data_path = utils.tmp_data_path

train_data = pd.read_csv(raw_data_path + "train.csv", nrows=100000)
test_data = pd.read_csv(raw_data_path + "test.csv", nrows=100000)
# train_data = test_data.copy()

# randomly draw utils.sample_pct of the training rows; 1.0 means use them all
if utils.sample_pct < 1.0:
    np.random.seed(999)
    r1 = np.random.uniform(0, 1, train_data.shape[0])
    train_data = train_data.loc[r1 < utils.sample_pct, :]
    print("testing with small sample of training data, ", train_data.shape)

# the test set lacks the label column of the training set, so add a placeholder
test_data['click'] = 0

# concatenate the training and test samples so feature engineering is applied uniformly
all_data = pd.concat([train_data, test_data])
print("finished loading raw data, ", all_data.shape)

print("to add some basic features ...")
# expand the hour feature into hour1, day_hour, day_hour_prev and day_hour_next
all_data['day'] = np.round(all_data.hour % 10000 / 100)
all_data['hour1'] = np.round(all_data.hour % 100)
all_data['day_hour'] = (all_data.day.values - 21) * 24 + all_data.hour1.values
all_data['day_hour_prev'] = all_data['day_hour'] - 1
all_data['day_hour_next'] = all_data['day_hour'] + 1
all_data['app_or_web'] = 0
all_data.loc[all_data.app_id.values == 'ecad2386', 'app_or_web'] = 1

copy_data = all_data
copy_data['app_site_id'] = np.add(copy_data.app_id.values, copy_data.site_id.values)

print("to encode categorical features using mean responses from earlier days -- univariate")
sys.stdout.flush()
calc_exptv(copy_data, ['app_or_web'])
exptv_vn_list = ['app_site_id', 'as_domain', 'C14', 'C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw',
                 'app_site_model', 'site_model', 'app_model', 'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']
calc_exptv(copy_data, exptv_vn_list)
calc_exptv(copy_data, ['app_site_id'], add_count=True)

print("to encode categorical features using mean responses from earlier days -- multivariate")
vns = ['app_or_web', 'device_ip', 'app_site_id', 'device_model', 'app_site_model', 'C1', 'C14', 'C17', 'C21',
       'device_type', 'device_conn_type', 'app_site_model_aw', 'dev_ip_app_site']
dftv = copy_data.loc[np.logical_and(copy_data.day.values >= 21, copy_data.day.values < 32), ['click', 'day', 'id'] + vns].copy()
dftv['app_site_model'] = np.add(dftv.device_model.values, dftv.app_site_id.values)
dftv['app_site_model_aw'] = np.add(dftv.app_site_model.values, dftv.app_or_web.astype(str).values)
dftv['dev_ip_app_site'] = np.add(dftv.device_ip.values, dftv.app_site_id.values)
for vn in vns:
    dftv[vn] = dftv[vn].astype('category')
    print(vn)

n_ks = {'app_or_web': 100, 'app_site_id': 100, 'device_ip': 10, 'C14': 50, 'app_site_model': 50, 'device_id': 50,
        'C17': 100, 'C21': 100, 'C1': 100, 'device_type': 100, 'device_conn_type': 100, 'banner_pos': 100,
        'app_site_model_aw': 100, 'dev_ip_app_site': 10, 'device_model': 500}
exp2_dict = {}
for vn in vns:
    exp2_dict[vn] = np.zeros(dftv.shape[0])
days_npa = dftv.day.values

for day_v in range(22, 32):
    # fit on all earlier days, validate on day_v
    df1 = dftv.loc[np.logical_and(dftv.day.values < day_v, dftv.day.values < 31), :].copy()
    df2 = dftv.loc[dftv.day.values == day_v, :]
    print("Validation day:", day_v, ", train data shape:", df1.shape, ", validation data shape:", df2.shape)
    pred_prev = df1.click.values.mean() * np.ones(df1.shape[0])
    for vn in vns:
        if 'exp2_' + vn in df1.columns:
            df1.drop('exp2_' + vn, inplace=True, axis=1)
    for i in range(3):
        for vn in vns:
            p1 = calcLeaveOneOut2(df1, vn, 'click', n_ks[vn], 0, 0.25, mean0=pred_prev)
            pred = pred_prev * p1
            print(day_v, i, vn, "change = ", ((pred - pred_prev) ** 2).mean())
            pred_prev = pred

    pred1 = df1.click.values.mean()
    for vn in vns:
        print("=" * 20, "merge", day_v, vn)
        diff1 = mergeLeaveOneOut2(df1, df2, vn)
        pred1 *= diff1
        exp2_dict[vn][days_npa == day_v] = diff1

    pred1 *= df1.click.values.mean() / pred1.mean()
    print("logloss = ", logloss(pred1, df2.click.values))
    # print(my_lift(pred1, None, df2.click.values, None, 20, fig_size=(10, 5)))
    # plt.show()

for vn in vns:
    copy_data['exp2_' + vn] = exp2_dict[vn]

print("to count prev/current/next hour by ip ...")
cntDualKey(copy_data, 'device_ip', None, 'day_hour', 'day_hour_prev', fill_na=0)
cntDualKey(copy_data, 'device_ip', None, 'day_hour', 'day_hour', fill_na=0)
cntDualKey(copy_data, 'device_ip', None, 'day_hour', 'day_hour_next', fill_na=0)

print("to create day diffs")
copy_data['pday'] = copy_data.day - 1
calcDualKey(copy_data, 'device_ip', None, 'day', 'pday', 'click', 10, None, True, True)
copy_data['cnt_diff_device_ip_day_pday'] = copy_data.cnt_device_ip_day.values - copy_data.cnt_device_ip_pday.values
copy_data['hour1_web'] = copy_data.hour1.values
copy_data.loc[copy_data.app_or_web.values == 0, 'hour1_web'] = -1
copy_data['app_cnt_by_dev_ip'] = my_grp_cnt(copy_data.device_ip.values.astype(str), copy_data.app_id.values.astype(str))
copy_data['hour1'] = np.round(copy_data.hour.values % 100)
copy_data['rank_dev_ip'] = my_grp_idx(copy_data.device_ip.values.astype(str), copy_data.id.values.astype(str))
copy_data['rank_day_dev_ip'] = my_grp_idx(np.add(copy_data.device_ip.values, copy_data.day.astype(str).values).astype(str), copy_data.id.values.astype(str))
copy_data['rank_app_dev_ip'] = my_grp_idx(np.add(copy_data.device_ip.values, copy_data.app_id.values).astype(str), copy_data.id.values.astype(str))
copy_data['cnt_dev_ip'] = get_agg(copy_data.device_ip.values, copy_data.id, np.size)
copy_data['cnt_dev_id'] = get_agg(copy_data.device_id.values, copy_data.id, np.size)
copy_data['dev_id_cnt2'] = np.minimum(copy_data.cnt_dev_id.astype('int32').values, 300)
copy_data['dev_ip_cnt2'] = np.minimum(copy_data.cnt_dev_ip.astype('int32').values, 300)
copy_data['dev_id2plus'] = copy_data.device_id.values
copy_data.loc[copy_data.cnt_dev_id.values == 1, 'dev_id2plus'] = '___only1'
copy_data['dev_ip2plus'] = copy_data.device_ip.values
copy_data.loc[copy_data.cnt_dev_ip.values == 1, 'dev_ip2plus'] = '___only1'
copy_data['diff_cnt_dev_ip_hour_phour_aw2_prev'] = (copy_data.cnt_device_ip_day_hour.values - copy_data.cnt_device_ip_day_hour_prev.values) * (copy_data.app_or_web * 2 - 1)
copy_data['diff_cnt_dev_ip_hour_phour_aw2_next'] = (copy_data.cnt_device_ip_day_hour.values - copy_data.cnt_device_ip_day_hour_next.values) * (copy_data.app_or_web * 2 - 1)

print("to save copy_data ...")
dump(copy_data, tmp_data_path + 'copy_data.joblib_dat')

print("to generate copy_datatv_mx ...")
app_or_web = None
_start_day = 22
list_param = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'banner_pos', 'device_type', 'device_conn_type']
feature_list_dict = {}
feature_list_name = 'tvexp3'
feature_list_dict[feature_list_name] = list_param + \
    ['exptv_' + vn for vn in ['app_site_id', 'as_domain',
                              'C14', 'C17', 'C21', 'device_model', 'device_ip', 'device_id', 'dev_ip_aw',
                              'dev_id_ip', 'C14_aw', 'C17_aw', 'C21_aw']] + \
    ['cnt_diff_device_ip_day_pday',
     'app_cnt_by_dev_ip', 'cnt_device_ip_day_hour', 'app_or_web',
     'rank_dev_ip', 'rank_day_dev_ip', 'rank_app_dev_ip',
     'diff_cnt_dev_ip_hour_phour_aw2_prev', 'diff_cnt_dev_ip_hour_phour_aw2_next',
     'exp2_device_ip', 'exp2_app_site_id', 'exp2_device_model', 'exp2_app_site_model',
     'exp2_app_site_model_aw', 'exp2_dev_ip_app_site',
     'cnt_dev_ip', 'cnt_dev_id', 'hour1_web']

filter_tv = np.logical_and(copy_data.day.values >= _start_day, copy_data.day.values < 31)
filter_t1 = np.logical_and(copy_data.day.values < 30, filter_tv)
filter_v1 = np.logical_and(~filter_t1, filter_tv)

print(filter_tv.sum())
for vn in feature_list_dict[feature_list_name]:
    if vn not in copy_data.columns:
        print("=" * 60 + vn)

yv = copy_data.click.values[filter_v1]
copy_datatv_mx = copy_data[feature_list_dict[feature_list_name]].values
print(copy_datatv_mx.shape)

print("to save copy_datatv_mx ...")
copy_datatv_mx_save = {}
copy_datatv_mx_save['copy_datatv_mx'] = copy_datatv_mx
copy_datatv_mx_save['click'] = copy_data.click.values
copy_datatv_mx_save['day'] = copy_data.day.values
copy_datatv_mx_save['site_id'] = copy_data.site_id.values
...
feuture_get.py
Source: feuture_get.py
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook


def lag(data, index_cols, lag_cols, mode="day"):
    # "date" must be among index_cols
    shift_range = [1, 2, 3, 4, 5, 12]

    for shift in tqdm_notebook(shift_range):
        shifted_data = data[index_cols + lag_cols].copy()

        # pd.DateOffset(seconds=shift) for second-level data
        # "-" pulls data from the future
        shifted_data['date'] -= pd.DateOffset(days=shift)

        foo = lambda x: '{}_{}lag_{}'.format(x, mode, shift) if x in lag_cols else x
        shifted_data = shifted_data.rename(columns=foo)

        data = pd.merge(data, shifted_data, on=index_cols, how='left').fillna(0)  # or other NaN value

    return data


def first_extremum(data, delta_list, value_column, mode='max'):
    # data must have "date"
    copy_data = data.copy()

    for delta in delta_list:
        for value_label in value_column:

            # a row is an extremum if it equals the rolling max/min of its centered window
            if mode == 'max':
                max_mask = (copy_data[value_label].rolling(1 + 2 * delta, center=True).max() ==
                            copy_data[value_label])
            else:
                max_mask = (copy_data[value_label].rolling(1 + 2 * delta, center=True).min() ==
                            copy_data[value_label])
            indexes = np.where(max_mask)[0]

            indexes_with_nan = np.concatenate([indexes, [None]])

            # fmxi is first max index (index of the first maximum)
            copy_data['{}_f{}i{}'.format(value_label, mode, 1 + 2 * delta)] = \
                indexes_with_nan[np.searchsorted(indexes, data.index, side='right')]

            # fmxr is first max range (range to the first maximum)
            copy_data['{}_f{}r{}'.format(value_label, mode, 1 + 2 * delta)] = \
                copy_data['{}_f{}i{}'.format(value_label, mode, 1 + 2 * delta)] - copy_data.index

            max_val = copy_data[max_mask][[value_label]]
            max_val = max_val.rename(columns=lambda x: "{}_{}{}".format(x, mode, 1 + 2 * delta))

            copy_data = copy_data.join(max_val, how='left')

            # print(copy_data[value_label] == copy_data[value_label + "_max"])
            copy_data.loc[copy_data[value_label] == copy_data['{}_{}{}'.format(value_label, mode, 1 + 2 * delta)],
                          '{}_f{}r{}'.format(value_label, mode, 1 + 2 * delta)] = 0

            copy_data.drop(['{}_f{}i{}'.format(value_label, mode, 1 + 2 * delta)], inplace=True, axis=1)
            copy_data = copy_data[::-1]
            copy_data['{}_{}{}'.format(value_label, mode, 1 + 2 * delta)] = copy_data[
                '{}_{}{}'.format(value_label, mode, 1 + 2 * delta)].ffill()
            copy_data = copy_data[::-1]

    return copy_data


# lag(data, ["date"], ["brent_close", "brent_open"])
#   date        brent_close  brent_open  brent_close_daylag_1  brent_open_daylag_1  ...  brent_close_daylag_12  brent_open_daylag_12
# 0 2002-07-01  25.64        25.50       25.75                 25.61                ...
# 1 2002-07-02  25.75        25.61       25.84                 25.73                ...

# first_extremum(data, [2, 3], ['brent_close', 'brent_open'], 'min')
#   date        brent_close  brent_open  brent_close_fminr5  brent_close_min5  brent_open_fminr5  brent_open_min5  brent_close_fminr7  brent_close_min7  brent_open_fminr7  brent_open_min7
# 0 2002-07-01  25.64        25.50       5                   25.08             6                  25.1             5                   25.08             6                  25.1
...
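Assuming the two functions above are saved as feuture_get.py, a minimal usage sketch could look like the following; the toy Brent price frame and its random values are invented for illustration.

import numpy as np
import pandas as pd
from feuture_get import lag, first_extremum  # the module shown above

# toy daily price frame; values are made up
df = pd.DataFrame({
    'date': pd.date_range('2002-07-01', periods=30, freq='D'),
    'brent_close': 25 + np.random.rand(30).round(2),
    'brent_open': 25 + np.random.rand(30).round(2),
})

lagged = lag(df, ['date'], ['brent_close', 'brent_open'])          # adds *_daylag_{1,2,3,4,5,12} columns
extrema = first_extremum(df, [2, 3], ['brent_close'], mode='min')  # adds *_fminr{5,7} and *_min{5,7} columns
print(lagged.columns.tolist())
print(extrema.head())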
modify.py
Source: modify.py
import sys


def modify_instance(name):
    with open(name, "r") as f:
        data = f.readlines()
    copy_data = data.copy()
    for i, line in enumerate(data):
        for j, c in enumerate(line):
            if c == "<":
                copy_data[i] = copy_data[i][:j] + ' ' + copy_data[i][j+1:]
            if c == ">":
                copy_data[i] = copy_data[i][:j] + ' ' + copy_data[i][j+1:]
            if c == ",":
                copy_data[i] = copy_data[i][:j] + ' ' + copy_data[i][j+1:]

    with open("modify.dat", "w") as out:
        out.writelines(copy_data)


if __name__ == "__main__":
    ...
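modify_instance blanks out every '<', '>', and ',' so the instance file can later be read as whitespace-separated data. Because each replacement swaps a single character in place, the same scrub can be written in one pass with str.translate; a sketch of that equivalent is below (the function name is hypothetical, and the character set is the one used in the loops above).

def modify_instance_translate(name, out_name="modify.dat"):
    # map each delimiter to a space; equivalent to the nested loops above
    table = str.maketrans({'<': ' ', '>': ' ', ',': ' '})
    with open(name, "r") as f:
        cleaned = [line.translate(table) for line in f]
    with open(out_name, "w") as out:
        out.writelines(cleaned)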