Source: project_code.py
import csv
import numpy as np
import timeit
import random
import itertools
import math
from random import randint

import matplotlib as mpl
mpl.use('Qt4Agg')  # backend must be selected before pyplot is imported
import matplotlib.pyplot as plt

from dim_reduction import *
from knn import KNN
from mpp import MPP
from sklearn import svm
from sklearn.metrics import roc_curve
from sklearn import tree
from bpnn import Network
from kmeans import KMeans
from kohonen import KMap
from wta import WTA


def filter_retweets(data):
    """Keep only samples whose retweet flag (column 2) is the string 'False'."""
    no_rt = []
    for sample in data:
        retweet = sample[2]
        if retweet == 'False':
            no_rt.append(sample)
    return no_rt


def extract_features(data):
    """Extract 9 stylometric features per tweet (tweet text is column 3)."""
    features = np.zeros((9, len(data)))
    for i in range(0, len(data)):
        tweet = data[i][3]
        upper = 0
        for word in tweet.split():
            if word.isupper():
                upper += 1
        features[0, i] = tweet.count('!')
        features[1, i] = tweet.lower().count('pic.twitter.com')
        features[2, i] = tweet.count('@')
        features[3, i] = upper
        features[4, i] = tweet.lower().count('http')
        features[5, i] = tweet.count('#')
        features[6, i] = tweet.count('"')
        features[7, i] = tweet.count(',')
        features[8, i] = tweet.count('.')
        # Earlier feature ideas, kept for reference:
        # features[7, i] = tweet.lower().count('trump') + tweet.lower().count('donald')
        # features[8, i] = tweet.lower().count('maga') + tweet.lower().count('make america great again') + tweet.lower().count('makeamericagreatagain') + tweet.lower().count('make #americagreatagain') + tweet.lower().count('make america') + tweet.lower().count('great again')
        # features[8, i] = tweet.lower().count('loser')
    return features


def nb_fusion(conf_mat, labels, true_labels):
    """Naive-Bayes fusion of several classifiers' 0/1 decisions.

    conf_mat:    (num_classifiers, 2, 2) confusion matrices
    labels:      (num_classifiers, num_samples) predicted labels
    true_labels: ground truth, used to estimate the class priors
    """
    num_classifiers = conf_mat.shape[0]
    comb = []
    for i in range(0, num_classifiers):
        comb.append(list(range(2)))
    comb = list(itertools.product(*comb))  # all 2^n label combinations
    num_comb = len(comb)
    table = np.zeros((2, len(comb)))
    num_samples = labels.shape[1]
    num1 = np.count_nonzero(true_labels)
    num0 = num_samples - num1
    for i in range(0, num_comb):
        prob0 = 1 / math.pow(num0, num_classifiers - 1)
        prob1 = 1 / math.pow(num1, num_classifiers - 1)
        prod = np.ones((2, 1))
        for j in range(0, num_classifiers):
            col = comb[i][j]
            prod = np.multiply(prod, conf_mat[j, :, col].reshape((2, 1)))
        prod[0] = prod[0] * prob0
        prod[1] = prod[1] * prob1
        table[:, i] = prod[:, 0]
    fused = np.zeros((num_samples, 1))
    for i in range(0, num_samples):
        combination = []
        for j in range(0, num_classifiers):
            combination.append(labels[j][i])
        combination = tuple(combination)
        entry = table[:, comb.index(combination)]
        if entry[0] > entry[1]:
            fused[i] = 0
        else:
            fused[i] = 1
    return table, comb, fused


def majority_vote(predictions):
    """Fuse (num_classifiers, num_samples) 0/1 predictions by majority vote."""
    num_classifiers = predictions.shape[0]
    num_samples = predictions.shape[1]
    fused = np.zeros(num_samples)
    for i in range(0, num_samples):
        yes = 0
        no = 0
        for j in range(0, num_classifiers):
            if predictions[j, i] == 0.0:
                no += 1
            else:
                yes += 1
        if yes > no:
            fused[i] = 1.0
        else:
            fused[i] = 0.0
    return fused


def standardize(data, mean, sigma):
    """In-place z-scoring: each column becomes (x - mean) / sigma."""
    for i in range(0, data.shape[1]):
        x = data[:, i].reshape(mean.shape)
        data[:, i] = ((x - mean) / sigma).reshape(x.shape[0])


def perf_eval(predict, true):
    """Return (tp, tn, fn, fp) counts for 0/1 predictions."""
    num_samples = predict.shape[0]
    fp = 0
    fn = 0
    tp = 0
    tn = 0
    for i in range(0, num_samples):
        if predict[i] == 0:
            if predict[i] == true[i]:
                tn += 1
            else:
                fn += 1
        else:
            if predict[i] == true[i]:
                tp += 1
            else:
                fp += 1
    return (tp, tn, fn, fp)


def confusion_matrix(predict, true):
    tp, tn, fn, fp = perf_eval(predict, true)
    conf_mat = np.zeros((2, 2))
    conf_mat[0, 0] = tp
    conf_mat[0, 1] = fp
    conf_mat[1, 0] = fn
    conf_mat[1, 1] = tn
    return conf_mat
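# A minimal sanity check of the fusion helpers (hypothetical values, not from
# the original script): three classifiers vote on four samples and the fused
# decision is scored.
#
#     preds = np.array([[1, 0, 1, 0],
#                       [1, 1, 0, 0],
#                       [1, 0, 0, 0]], dtype=float)
#     truth = np.array([1, 0, 1, 0], dtype=float)
#     fused = majority_vote(preds)           # -> [1., 0., 0., 0.]
#     tp, tn, fn, fp = perf_eval(fused, truth)
#     print(confusion_matrix(fused, truth))  # [[1., 0.], [1., 2.]]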
def m_fold_cross_validation(tweets, person, m):
    """Build m equal train/test splits over the pooled tweets of all six accounts.

    `person` is the index of the account treated as the positive class.
    """
    for account in tweets:
        print(len(account))
    all_tweets = []
    for account in tweets:
        all_tweets.extend(account)
    y = [0] * len(all_tweets)
    start = 0
    end = 0
    for i in range(0, person):
        start += len(tweets[i])
    end = start + len(tweets[person])
    print(start)
    print(end)
    for i in range(start, end):
        y[i] = 1.0
    z = list(zip(all_tweets, y))
    random.shuffle(z)
    all_tweets, all_labels = zip(*z)
    num_per_set = int(len(all_tweets) / m)
    all_tweets = all_tweets[0:num_per_set * m]   # drop the remainder so folds are equal
    all_labels = all_labels[0:num_per_set * m]
    sets = []
    for i in range(0, m):
        start = i * num_per_set
        end = (i + 1) * num_per_set
        train_tweets = all_tweets[0:start] + all_tweets[end:]
        train_labels = all_labels[0:start] + all_labels[end:]
        test_tweets = all_tweets[start:end]
        test_labels = all_labels[start:end]
        train = (train_tweets, train_labels)
        test = (test_tweets, test_labels)
        sets.append((train, test))
    return sets
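# A minimal sketch of the fold bookkeeping (hypothetical toy lists, not the
# real CSV data): with 6 accounts of 5 tweets each and m = 5, every fold
# holds 30 // 5 = 6 test tweets and 24 training tweets.
#
#     toy = [[('id', d, 'False', 'text %d' % i) for i in range(5)] for d in range(6)]
#     folds = m_fold_cross_validation(toy, person=0, m=5)
#     train, test = folds[0]
#     len(train[0]), len(test[0])   # -> (24, 6)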
def create_dataset(tweets, person, num_train_tweets, train_percentages,
                   num_test_tweets, test_percentages):
    """Draw train/test sets mixing the six accounts by the given percentages.

    Tweets of account `person` are labeled 1, everything else 0.
    """
    for account in tweets:
        random.shuffle(account)
    train_data = []
    test_data = []
    num_train = [int(p * num_train_tweets) for p in train_percentages]
    num_test = [int(p * num_test_tweets) for p in test_percentages]
    # The original wrote these six blocks out longhand; a loop is equivalent.
    for a in range(6):
        for i in range(0, num_train[a]):
            train_data.append(tweets[a][i])
        for i in range(num_train[a], num_train[a] + num_test[a]):
            test_data.append(tweets[a][i])

    train_labels = np.zeros(len(train_data))
    start = int(np.sum(train_percentages[0:person]) * num_train_tweets)
    end = int(np.sum(train_percentages[0:person + 1]) * num_train_tweets)
    for i in range(start, end):
        train_labels[i] = 1

    test_labels = np.zeros(len(test_data))
    start = int(np.sum(test_percentages[0:person]) * num_test_tweets)
    end = int(np.sum(test_percentages[0:person + 1]) * num_test_tweets)
    for i in range(start, end):
        test_labels[i] = 1
    return [(train_data, train_labels), (test_data, test_labels)]


def plot_roc(f_rate, t_rate, label_str):
    plt.plot(f_rate, t_rate, label=label_str)
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # plt.legend()


def main():
    filenames = ['DonaldTrumpDataSet.csv', 'HillaryClintonDataSet.csv',
                 'KimKardashianDataSet.csv', 'NeildeGrasseTysonDataSet.csv',
                 'RichardDawkinsDataSet.csv', 'ScottKellyDataSet.csv']
    # The original read each file with its own with-block; a loop is equivalent.
    tweets = []
    for name in filenames:
        rows = []
        with open(name, 'r', encoding='utf8', errors='ignore') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for row in reader:
                rows.append(row)
        rows.pop(0)  # drop the header row
        tweets.append(rows)

    nort_tweets = [filter_retweets(t) for t in tweets]
    # percentages = [0.43, 0.08, 0.26, 0.06, 0.14, 0.03]
    percentages = [0.17, 0.17, 0.17, 0.17, 0.16, 0.16]
    datasets = create_dataset(tweets, 0, 7000, percentages, 500, percentages)
    nort_datasets = create_dataset(nort_tweets, 0, 7000, percentages, 500, percentages)
    train_set = datasets[0][0]
    train_labels = datasets[0][1]
    test_set = datasets[1][0]
    test_labels = datasets[1][1]
    # Bug fix: the original read these four from `datasets` again, so the
    # retweet-filtered variables silently duplicated the unfiltered ones.
    nort_train_set = nort_datasets[0][0]
    nort_train_labels = nort_datasets[0][1]
    nort_test_set = nort_datasets[1][0]
    nort_test_labels = nort_datasets[1][1]
    data = train_set
    true_labels = train_labels
    test_data = test_set
    nort_data = nort_train_set
    nort_true_labels = nort_train_labels
    nort_test_data = nort_test_set

    features = extract_features(data)
    nort_features = extract_features(nort_data)
    test_features = extract_features(test_data)
    # Bug fix: this was `test_features2 = test_features`, an alias, so the two
    # standardize() calls below rescaled the same array twice.
    test_features2 = test_features.copy()
    mean = np.mean(features, axis=1).reshape((features.shape[0], 1))
    sigma = np.std(features, axis=1).reshape((features.shape[0], 1))
    mean2 = np.mean(nort_features, axis=1).reshape((nort_features.shape[0], 1))
    sigma2 = np.std(nort_features, axis=1).reshape((nort_features.shape[0], 1))
    standardize(features, mean, sigma)
    standardize(nort_features, mean2, sigma2)
    standardize(test_features, mean, sigma)
    standardize(test_features2, mean2, sigma2)

    # A long run of commented-out experiments followed here: FLD and PCA
    # (tol=0.8) dimension reduction; a decision tree; SVMs with linear, poly,
    # rbf and sigmoid kernels; KNN (k=3) under the 2, infinity and 1 norms on
    # both the full and the retweet-filtered features; MPP cases 1-3 with
    # priors taken from the class frequencies, plus a majority-vote fusion of
    # the three MPP predictions; a BPNN; KMeans, WTA and Kohonen-map
    # clustering; and a 5-fold cross-validated BPNN. Every block followed the
    # same pattern: fit, predict, roc_curve()/plot_roc(), then
    # accuracy/TP/TN/FP/FN from perf_eval(). One representative block:
    #
    # print("SVM rbf")
    # clf = svm.SVC(kernel='rbf', gamma='auto')
    # clf.probability = True
    # clf.fit(features.T, true_labels)
    # ymodel = clf.predict(test_features.T)
    # prob = clf.predict_proba(test_features.T)
    # fper, tper, thresh = roc_curve(test_labels, prob[:, 1], pos_label=1)
    # plt.figure()
    # plot_roc(fper, tper, 'SVM rbf')
    # tp, tn, fn, fp = perf_eval(ymodel, test_labels)
    # print('Accuracy: ', (tp + tn) / (tp + tn + fp + fn))
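    # Sanity check for standardize() (hypothetical, not part of the original
    # run): after the in-place z-scoring above, every training feature row
    # should have mean ~0 and standard deviation ~1.
    #
    #     assert np.allclose(np.mean(features, axis=1), 0, atol=1e-9)
    #     assert np.allclose(np.std(features, axis=1), 1, atol=1e-9)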
    m = 10
    sets = m_fold_cross_validation(tweets, 0, m)
    print(len(sets))
    num_test = len(sets[0][0][1])
    print(num_test)
    for i in range(0, 1):
        print('Set', i)
        train, test = sets[i]
        train_tweets, train_labels = train
        test_tweets, test_labels = test
        # The CV split above is immediately overridden by a fixed-percentage
        # dataset in this version of the experiment.
        percentages = [0.43, 0.08, 0.26, 0.06, 0.14, 0.03]
        datasets = create_dataset(tweets, 0, 10000, percentages, 1000, percentages)
        train_tweets = datasets[0][0]
        train_labels = datasets[0][1]
        test_tweets = datasets[1][0]
        test_labels = datasets[1][1]

        train_features = extract_features(train_tweets)
        test_features = extract_features(test_tweets)
        mean = np.mean(train_features, axis=1).reshape((train_features.shape[0], 1))
        sigma = np.std(train_features, axis=1).reshape((train_features.shape[0], 1))
        standardize(train_features, mean, sigma)
        standardize(test_features, mean, sigma)

        # More commented-out experiments sat here: FLD and PCA (tol=0.75)
        # reductions feeding a decision tree, with an ROC curve per reduction;
        # KNN (k=5) under the infinity, 1 and 2 norms; KMeans, WTA and
        # Kohonen-map clustering under the same three norms, each recording
        # sensitivity and specificity; grouped bar charts comparing the four
        # methods' sensitivity and specificity across norms; and KNN (k=3)
        # with ROC plotting plus a majority-vote fusion of its predictions.
        # The BPNN comparison below is the active experiment; the commented
        # net.SGD(...) variants in the original trained for hard predictions
        # instead of probabilities.
        net = Network([features.shape[0], 10, 2])
        prob = net.SGD_prob(train_features, train_labels, 100, 1, 0.10,
                            test_features, test_labels)
        fper, tper, thresh = roc_curve(test_labels, prob[:, 1], pos_label=1)
        plot_roc(fper, tper, 'Standard')

        fld = FLD()
        fld.setup(train_features, train_labels)
        fld_train_features = fld.reduce(train_features)
        fld_test_features = fld.reduce(test_features)
        net = Network([fld_train_features.shape[0], 10, 2])
        prob = net.SGD_prob(fld_train_features, train_labels, 100, 1, 0.10,
                            fld_test_features, test_labels)
        fper, tper, thresh = roc_curve(test_labels, prob[:, 1], pos_label=1)
        plot_roc(fper, tper, 'FLD')

        pca = PCA()
        tol = 0.75
        pca.setup(train_features, tol)
        pca_train_features = pca.reduce(train_features)
        pca_test_features = pca.reduce(test_features)
        net = Network([pca_train_features.shape[0], 10, 2])
        prob = net.SGD_prob(pca_train_features, train_labels, 100, 1, 0.10,
                            pca_test_features, test_labels)
        fper, tper, thresh = roc_curve(test_labels, prob[:, 1], pos_label=1)
        plot_roc(fper, tper, 'PCA')

        plt.title('BPNN with different dimension-reduction techniques')
        plt.legend()
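        # Shape check for the ROC input (hypothetical values, since bpnn's
        # Network is a local module): SGD_prob is assumed to return one row
        # per test sample with class-0/class-1 scores in its two columns,
        # which is what roc_curve consumes above.
        #
        #     toy_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
        #     toy_true = np.array([0, 1, 0])
        #     fpr, tpr, _ = roc_curve(toy_true, toy_prob[:, 1], pos_label=1)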
        # The remaining commented-out experiments: the same Standard/FLD/PCA
        # ROC comparison for an rbf-kernel SVM; an SVM kernel comparison
        # (linear, poly, rbf, sigmoid); KNN with k=1 plus a sweep of k from 1
        # to sqrt(n) recording sensitivity and specificity per k; a deeper
        # BPNN ([n, 10, 10, 2]); and a sweep of the prior P(class 1) from 0.1
        # to 0.9 that evaluated MPP cases 1-3 and KNN (k=5), fused them by
        # majority vote and by nb_fusion() over the stacked confusion
        # matrices, and plotted sensitivity and specificity against the prior.

    plt.show()


if __name__ == "__main__":
    main()
Source: learn_models.py
import collections
import csv
import itertools
import json
import random

import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.feature_extraction
import sklearn.linear_model  # added: used below but never imported
import sklearn.metrics
import sklearn.svm           # added: used below but never imported
from sklearn.datasets import fetch_20newsgroups
# Deprecated in modern scikit-learn; `import joblib` is the replacement.
from sklearn.externals import joblib

import embedding_forest
import lime
import lstm


def clean_vectors_wordlist(input_vectors, vectorizer, wordlist):
    """Return a copy of the document-term matrix with the wordlist columns zeroed."""
    ret_vectors = input_vectors.copy()
    words = np.array([vectorizer.vocabulary_[x] for x in set(wordlist)
                      if x in vectorizer.vocabulary_])
    ret_vectors[:, words] = 0
    return ret_vectors


def GetSuggestions(model, test_data, raw_data, test_labels):
    """Collect up to 15 FP/TP/FN/TN examples each for the suggestion UI."""
    test_labels = np.array(test_labels)
    preds = (model.predict_proba(test_data)[:, 1] > .5).astype(int)
    fp = np.where((preds == 1) * (test_labels == 0))[0]
    tp = np.where((preds == 1) * (test_labels == 1))[0]
    fn = np.where((preds == 0) * (test_labels == 1))[0]
    tn = np.where((preds == 0) * (test_labels == 0))[0]
    suggestions = []
    # Bug fix: the original guard was `if i`, which also skipped index 0;
    # int() so json.dump can serialize the numpy integer later.
    add_suggestion = lambda title, i: suggestions.append(
        {'title': 'ID %d (%s)' % (i, title), 'text': raw_data[i],
         'true_class': int(test_labels[i])}) if i is not None else None
    # itertools.izip_longest in the original is Python 2; zip_longest is the
    # Python 3 spelling.
    for a, b, c, d in itertools.zip_longest(fp[:15], tp[:15], fn[:15], tn[:15]):
        add_suggestion('FP', a)
        add_suggestion('TP', b)
        add_suggestion('FN', c)
        add_suggestion('TN', d)
    return suggestions


def GetSuggestionsPair(model1, model2, test_data, raw_data, test_labels, nn=False):
    """Like GetSuggestions, but bins examples by the (model1, model2) outcome pair."""
    test_labels = np.array(test_labels)
    preds1 = (model1.predict_proba(test_data)[:, 1] > .5).astype(int)
    if nn:
        preds2 = (model2.predict_proba(raw_data)[:, 1] > .5).astype(int)
    else:
        preds2 = (model2.predict_proba(test_data)[:, 1] > .5).astype(int)
    fp_fp = np.where((preds1 == 1) * (test_labels == 0) * (preds2 == 1))[0]
    fp_tn = np.where((preds1 == 1) * (test_labels == 0) * (preds2 == 0))[0]
    tn_fp = np.where((preds1 == 0) * (test_labels == 0) * (preds2 == 1))[0]
    tn_tn = np.where((preds1 == 0) * (test_labels == 0) * (preds2 == 0))[0]
    fn_fn = np.where((preds1 == 0) * (test_labels == 1) * (preds2 == 0))[0]
    fn_tp = np.where((preds1 == 0) * (test_labels == 1) * (preds2 == 1))[0]
    tp_tp = np.where((preds1 == 1) * (test_labels == 1) * (preds2 == 1))[0]
    tp_fn = np.where((preds1 == 1) * (test_labels == 1) * (preds2 == 0))[0]
    suggestions = []
    add_suggestion = lambda title, i: suggestions.append(
        {'title': 'ID %d (%s)' % (i, title), 'text': raw_data[i],
         'true_class': int(test_labels[i])}) if i is not None else None
    for a, b, c, d, e, f, g, h in itertools.zip_longest(
            fp_fp[:15], fp_tn[:15], tn_fp[:15], tn_tn[:15],
            fn_fn[:15], fn_tp[:15], tp_fn[:15], tp_tp[:15]):
        add_suggestion('FP-FP', a)
        add_suggestion('FP-TN', b)
        add_suggestion('TN-FP', c)
        add_suggestion('TN-TN', d)
        add_suggestion('FN-FN', e)
        add_suggestion('FN-TP', f)
        add_suggestion('TP-FN', g)
        add_suggestion('TP-TP', h)
    return suggestions


def LoadPoliteness(path, percent_test=.1):
    data = []
    with open(path) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data.append((row['Request'], float(row['Normalized Score'])))
    data = sorted(data, key=lambda x: x[1])
    # `/` here was Python 2 integer division; `//` keeps the index integral.
    quartile_len = len(data) // 4
    negatives = [x[0] for x in data[:quartile_len]]
    positives = [x[0] for x in data[-quartile_len:]]
    random.seed(1)
    random.shuffle(positives)
    random.shuffle(negatives)
    size_test = int(len(negatives) * percent_test)
    size_train = len(negatives) - size_test
    train = positives[:size_train] + negatives[:size_train]
    train_labels = np.hstack((np.ones(size_train), np.zeros(size_train))).astype('int')
    test = positives[size_train:] + negatives[size_train:]
    test_labels = np.hstack((np.ones(size_test), np.zeros(size_test))).astype('int')
    return train, train_labels, test, test_labels
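# A minimal sketch of clean_vectors_wordlist on a toy corpus (hypothetical,
# not from the original script):
#
#     v = sklearn.feature_extraction.text.CountVectorizer(binary=True)
#     X = v.fit_transform(['good movie', 'bad movie'])
#     Xc = clean_vectors_wordlist(X, v, ['movie'])
#     # the 'movie' column of Xc is all zeros; 'good'/'bad' are untouched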
np.hstack((np.ones(size_train), np.zeros(size_train))).astype('int')
    test = positives[size_train:] + negatives[size_train:]
    test_labels = np.hstack((np.ones(size_test), np.zeros(size_test))).astype('int')
    return train, train_labels, test, test_labels

def LearnPoliteness():
    # Train an SVM, a random forest and a logistic regression on the Stanford
    # politeness (Wikipedia) data, then collect suggestions for each single
    # model and each pair of models.
    train, train_labels, test, test_labels = LoadPoliteness('data/stanford_politeness/wikipedia.annotated.csv')
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False, min_df=10)
    vectorizer.fit(train + test)
    train_vectors = vectorizer.transform(train)
    test_vectors = vectorizer.transform(test)
    svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10, gamma=0.001)
    svm.fit(train_vectors, train_labels)
    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)
    rf.fit(train_vectors, train_labels)
    lr = sklearn.linear_model.LogisticRegression()
    lr.fit(train_vectors, train_labels)
    suggestions = {}
    suggestions['lr'] = GetSuggestions(lr, test_vectors, test, test_labels)
    suggestions['rf'] = GetSuggestions(rf, test_vectors, test, test_labels)
    suggestions['svm'] = GetSuggestions(svm, test_vectors, test, test_labels)
    suggestions['lr-rf'] = GetSuggestionsPair(lr, rf, test_vectors, test, test_labels)
    suggestions['lr-svm'] = GetSuggestionsPair(lr, svm, test_vectors, test, test_labels)
    suggestions['rf-svm'] = GetSuggestionsPair(rf, svm, test_vectors, test, test_labels)
    ret = {}
    ret['svm'] = {}
    ret['svm']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, svm.predict(test_vectors))
    ret['svm']['model'] = svm
    ret['rf'] = {}
    ret['rf']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rf.predict(test_vectors))
    ret['rf']['model'] = rf
    ret['lr'] = {}
    ret['lr']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, lr.predict(test_vectors))
    ret['lr']['model'] = lr
    ret['vectorizer'] = vectorizer
    ret['class_names'] = ['rude', 'polite']
    return ret, suggestions

def Load20NG():
    # Binary subset of 20 newsgroups: atheism vs. Christianity.
    cats = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    train, train_labels = newsgroups_train.data, newsgroups_train.target
    test, test_labels = newsgroups_test.data, newsgroups_test.target
    return train, train_labels, test, test_labels

def Learn20NG():
    train, train_labels, test, test_labels = Load20NG()
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False)
    vectorizer.fit(train + test)
    train_vectors = vectorizer.transform(train)
    test_vectors = vectorizer.transform(test)
    svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10, gamma=0.001)
    svm.fit(train_vectors, train_labels)
    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)
    rf.fit(train_vectors, train_labels)
    lr = sklearn.linear_model.LogisticRegression()
    lr.fit(train_vectors, train_labels)
    # This wordlist achieves 78.02% accuracy on the religion dataset
    wordlist = 'in,to,Re,In,1993,rutgers,athos,writes,article,12,And,you,on,heart,will,Chuck,not,gvg47,gvg,He,this,may,10,us,When,before,alt,uk,co,mantis,up,post,Distribution,You,Keith,kmr4,Ryan,Bill,pooh,for,the,Host,Posting,NNTP,New,Thanks,anyone,email,has,Newsreader,Nntp,wrote,agree,Sandvik,edu,clh,by,who,thoughts,thing,saturn,wwc,more,EDU,try,wouldn,am,as,world,livesey,Livesey,wpd,solntze,jon,from,it,cc,little,Conner,osrhe,here,VMS,don,than,13,would,also,18,about,University,TIN,FAQ,version,even,PL9,said,being,Yet,so,he,they,interested,geneva,17,athena,May,love,me,whether,St,COM,Inc,newton,TEK,Kent,mean,sandvik,Or,Beaverton,lot,week,need,education,our,Robert,Don,Reply,cs,which,Computer,Organization,rusnews,Jim,bmd,trw,deleted,position,now,isn,whole,mathew,00,05,Michael,subject,CA,Princeton,po,CWRU,okcforum,bil,GMT,Bake,Timmons,timmbake,mcl,sgi,au,Dan,com,Unix'.split(',')
    # 'cleansvm' is an SVM trained on vectors cleaned with the wordlist above.
    cleaned_train = clean_vectors_wordlist(train_vectors, vectorizer, wordlist)
    cleansvm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10, gamma=0.001)
    cleansvm.fit(cleaned_train, train_labels)
    rfemb = embedding_forest.EmbeddingForest(vectorizer)
    rfemb.fit(train_vectors, train_labels)
    suggestions = {}
    suggestions['lr'] = GetSuggestions(lr, test_vectors, test, test_labels)
    suggestions['rf'] = GetSuggestions(rf, test_vectors, test, test_labels)
    suggestions['rfemb'] = GetSuggestions(rfemb, test_vectors, test, test_labels)
    suggestions['svm'] = GetSuggestions(svm, test_vectors, test, test_labels)
    suggestions['cleansvm'] = GetSuggestions(cleansvm, test_vectors, test, test_labels)
    suggestions['cleansvm-lr'] = GetSuggestionsPair(cleansvm, lr, test_vectors, test, test_labels)
    suggestions['cleansvm-rf'] = GetSuggestionsPair(cleansvm, rf, test_vectors, test, test_labels)
    suggestions['cleansvm-rfemb'] = GetSuggestionsPair(cleansvm, rfemb, test_vectors, test, test_labels)
    suggestions['cleansvm-svm'] = GetSuggestionsPair(cleansvm, svm, test_vectors, test, test_labels)
    suggestions['lr-rf'] = GetSuggestionsPair(lr, rf, test_vectors, test, test_labels)
    suggestions['lr-rfemb'] = GetSuggestionsPair(lr, rfemb, test_vectors, test, test_labels)
    suggestions['lr-svm'] = GetSuggestionsPair(lr, svm, test_vectors, test, test_labels)
    suggestions['lr-cleansvm'] = GetSuggestionsPair(lr, cleansvm, test_vectors, test, test_labels)
    suggestions['rf-rfemb'] = GetSuggestionsPair(rf, rfemb, test_vectors, test, test_labels)
    suggestions['rf-svm'] = GetSuggestionsPair(rf, svm, test_vectors, test, test_labels)
    suggestions['rfemb-svm'] = GetSuggestionsPair(rfemb, svm, test_vectors, test, test_labels)
    ret = {}
    ret['svm'] = {}
    ret['svm']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, svm.predict(test_vectors))
    ret['svm']['model'] = svm
    ret['cleansvm'] = {}
    ret['cleansvm']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, cleansvm.predict(test_vectors))
    ret['cleansvm']['model'] = cleansvm
    ret['rf'] = {}
    ret['rf']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rf.predict(test_vectors))
    ret['rf']['model'] = rf
    ret['rfemb'] = {}
    ret['rfemb']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rfemb.predict(test_vectors))
    ret['rfemb']['model'] = rfemb
    ret['lr'] = {}
    ret['lr']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, lr.predict(test_vectors))
    ret['lr']['model'] = lr
    ret['vectorizer'] = vectorizer
    ret['class_names'] = ['Atheism', 'Christian']
    return ret, suggestions

def LoadSentimentFile(path):
    # Each line is tab-separated: text<TAB>integer label.
    data = []
    labels = []
    for line in open(path, encoding='utf-8', errors='ignore'):
        x, y = line.strip().split('\t')
        data.append(x)
        labels.append(int(y))
    return data, labels

def LoadSentiment():
    train, train_labels = LoadSentimentFile('data/sentiment-train')
    test, test_labels = LoadSentimentFile('data/sentiment-test')
    return train, train_labels, test, test_labels

def LearnSentiment():
    train, train_labels, test, test_labels = LoadSentiment()
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False, min_df=10)
    vectorizer.fit(train + test)
    train_vectors = vectorizer.transform(train)
    test_vectors = vectorizer.transform(test)
    rf = sklearn.ensemble.RandomForestClassifier(n_estimators=500, n_jobs=10)
    rf.fit(train_vectors, train_labels)
    lr = sklearn.linear_model.LogisticRegression()
    lr.fit(train_vectors, train_labels)
    # Wrap the LSTM in a namedtuple so it exposes the same predict_proba
    # interface as the sklearn models; note it consumes raw text, not vectors.
    DummyModel = collections.namedtuple('model', ['predict_proba'])
    nn = DummyModel(lstm.GetLSTM())
    suggestions = {}
    suggestions['lr'] = GetSuggestions(lr, test_vectors, test, test_labels)
    suggestions['rf'] = GetSuggestions(rf, test_vectors, test, test_labels)
    suggestions['nn'] = GetSuggestions(nn, test, test, test_labels)
    suggestions['lr-rf'] = GetSuggestionsPair(lr, rf, test_vectors, test, test_labels)
    suggestions['lr-nn'] = GetSuggestionsPair(lr, nn, test_vectors, test, test_labels, nn=True)
    suggestions['rf-nn'] = GetSuggestionsPair(rf, nn, test_vectors, test, test_labels, nn=True)
    ret = {}
    ret['nn'] = {}  # no 'model' entry: the LSTM itself is not stored in the dump
    ret['nn']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, (nn.predict_proba(test)[:, 1] > .5).astype(int))
    ret['rf'] = {}
    ret['rf']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, rf.predict(test_vectors))
    ret['rf']['model'] = rf
    ret['lr'] = {}
    ret['lr']['accuracy'] = sklearn.metrics.accuracy_score(test_labels, lr.predict(test_vectors))
    ret['lr']['model'] = lr
    ret['vectorizer'] = vectorizer
    ret['class_names'] = ['Negative', 'Positive']
    return ret, suggestions

def main():
    suggestions = {}
    ret = {}
    ret['politeness'], suggestions['politeness'] = LearnPoliteness()
    ret['20ng'], suggestions['20ng'] = Learn20NG()
    ret['sentiment'], suggestions['sentiment'] = LearnSentiment()
    joblib.dump(ret, 'models/models')
    acc = {}
    for dataset in ret:
        acc[dataset] = {}
        for model in ret[dataset]:
            if model == 'class_names' or model == 'vectorizer':
                continue
            acc[dataset][model] = ret[dataset][model]['accuracy']
    ret_suggestions = {'suggestions': suggestions, 'accuracy': acc}
    json.dump(ret_suggestions, open('static/suggestions.json', 'w'))

if __name__ == '__main__':
    main()
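main() leaves two artifacts behind: the fitted models in 'models/models' (via joblib) and a JSON summary of accuracies and suggestions in 'static/suggestions.json'. A minimal sketch of reading them back, assuming main() has already been run from the same working directory (the example sentence is illustrative only):

# Sketch: consume the artifacts written by main() above.
import json
import joblib

models = joblib.load('models/models')
# e.g. classify a new sentence with the politeness logistic regression
lr = models['politeness']['lr']['model']
vec = models['politeness']['vectorizer']
pred = lr.predict(vec.transform(['Thanks so much for the help!']))[0]
print(models['politeness']['class_names'][pred])

# print per-dataset, per-model accuracies from the JSON summary
summary = json.load(open('static/suggestions.json'))
for dataset, per_model in summary['accuracy'].items():
    print(dataset, per_model)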
q8. Multiclass Logistic Regression.py
Source:q8. Multiclass Logistic Regression.py
import numpy as np
import pandas as pd
import os
# import matplotlib.pyplot as plt

def sigmoid(X):
    return 1.0/(1+np.exp(-X))

def logisticRegression(features, labels, learning_rate=0.01, epochs=1000, test=False, test_features=None, test_labels=None):
    """
    features: (number_examples, number_features)
    labels: (number_examples, 1)

    returns: weights, bias
    """
    number_examples = features.shape[0]
    labels = np.reshape(labels, (number_examples, 1))
    number_params = features.shape[1]
    W = np.random.randn(number_params, 1)
    b = np.random.randn()
    for epoch in range(epochs):
        # forward pass and batch gradient-descent update
        Z = np.dot(features, W) + b
        A = sigmoid(Z)
        loss = (labels.T).dot(np.log(A)) + (1-labels).T.dot(np.log(1-A))  # log-likelihood, for the print below
        delta = A - labels
        dW = np.dot(features.T, delta)
        W += -learning_rate*dW/number_examples
        b += -learning_rate*np.sum(delta)/number_examples
        # if epoch % 10 == 0:
        #     print(-loss)
    if test:
        if test_features is not None and test_labels is not None and test_features.shape[0] == test_labels.shape[0]:
            Z = np.dot(test_features, W) + b
            A = sigmoid(Z)
            predictions = (A > 0.5).astype(int)
            accuracy = np.sum(predictions[:, 0] == test_labels) / test_features.shape[0]
            print('accuracy: ', accuracy*100, '\n')
        else:
            raise ValueError('test_features and test_labels must be provided and have the same number of rows')
    return W, b

np.random.seed(1)
cwd = os.getcwd()
file_name = 'data4.xlsx'
file_path = os.path.join(cwd, file_name)
excel_data = pd.ExcelFile(file_path).parse('Sheet1', header=None)
copy_data = excel_data.values
np.random.shuffle(copy_data)
data = copy_data
data_size = data.shape[0]
train_data_size = int(data_size*0.6)
test_data_size = data_size - train_data_size
train_data = np.copy(data[:train_data_size])
test_data = np.copy(data[train_data_size:])
# first four columns are features, the fifth is the class label (1, 2 or 3)
train_features = train_data[:, :4]
train_labels = train_data[:, 4]
test_features = test_data[:, :4]
test_labels = test_data[:, 4]
train_features = (train_features - np.mean(train_features, axis=0))/np.std(train_features, axis=0)
test_features = (test_features - np.mean(test_features, axis=0))/np.std(test_features, axis=0)

# one-vs-all: train one binary classifier per class, on fresh copies of the
# labels so later rounds still see the original class ids
W_ova = {}
b_ova = {}
for i in range(1, 4):
    ova_train_labels = (train_labels == i).astype(int)
    ova_test_labels = (test_labels == i).astype(int)
    _W, _b = logisticRegression(train_features, ova_train_labels, learning_rate=0.05, test=True, test_features=test_features, test_labels=ova_test_labels)
    W_ova[str(i)] = _W
    b_ova[str(i)] = _b
# print(W_ova, b_ova)

# one-vs-one: train one binary classifier per pair of classes
W_ovo = {}
b_ovo = {}
for i in range(1, 4):
    for j in range(i+1, 4):
        # boolean masks keep each class's own row count, so classes of
        # different sizes are handled correctly
        data_i = copy_data[copy_data[:, -1] == i]
        data_j = copy_data[copy_data[:, -1] == j]
        dataset = np.vstack((data_i, data_j))
        np.random.shuffle(dataset)
        dataset_size = dataset.shape[0]
        train_data_size = int(0.6*dataset_size)
        test_data_size = dataset_size - train_data_size
        train_data = np.copy(dataset[:train_data_size])
        test_data = np.copy(dataset[train_data_size:])
        train_features = train_data[:, :4]
        train_labels = train_data[:, 4]
        test_features = test_data[:, :4]
        test_labels = test_data[:, 4]
        train_features = (train_features - np.mean(train_features, axis=0))/np.std(train_features, axis=0)
        test_features = (test_features - np.mean(test_features, axis=0))/np.std(test_features, axis=0)
        # label 1 means "class i", 0 means "class j"
        train_labels = (train_labels == i).astype(int)
        test_labels = (test_labels == i).astype(int)
        print('classes', i, 'vs', j)
        _W, _b = logisticRegression(train_features, train_labels, learning_rate=0.1, epochs=2000, test=True, test_features=test_features, test_labels=test_labels)
        W_ovo[str(i)+str(j)] = _W
        b_ovo[str(i)+str(j)] = _b
# print(W_ovo)
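The script trains and stores the binary classifiers but stops short of combining them into multiclass predictions. A minimal sketch of how inference could look with the W_ova/b_ova and W_ovo/b_ovo dictionaries built above; predict_ova and predict_ovo are hypothetical helpers, not part of the original script, while sigmoid and the 1..3 class ids are taken from it:

def predict_ova(features, W_ova, b_ova):
    # one-vs-all: pick the class whose classifier is most confident
    scores = np.hstack([sigmoid(features.dot(W_ova[str(c)]) + b_ova[str(c)])
                        for c in range(1, 4)])
    return np.argmax(scores, axis=1) + 1  # columns 0..2 map to classes 1..3

def predict_ovo(features, W_ovo, b_ovo):
    # one-vs-one: each pairwise classifier votes; label 1 meant "class i" above
    votes = np.zeros((features.shape[0], 4), dtype=int)
    for i in range(1, 4):
        for j in range(i + 1, 4):
            p = sigmoid(features.dot(W_ovo[str(i) + str(j)]) + b_ovo[str(i) + str(j)])[:, 0]
            winners = np.where(p > 0.5, i, j)
            for row, w in enumerate(winners):
                votes[row, w] += 1
    return np.argmax(votes[:, 1:], axis=1) + 1  # most-voted class per row

# e.g. ova_predictions = predict_ova(test_features, W_ova, b_ova)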