Best Python code snippet using gherkin-python
BBC_NEWS.py
Source:BBC_NEWS.py
...
        'Keywords_CountV': x['Keywords_CountV'],
        'Content': x['Content']
    })
"""

def feature_keywords(outfile, collection, vectorizer):
    collection.delete_many({})
    data = open(outfile).read()
    labels, texts = [], []
    for i, line in enumerate(data.split("\n")):
        if len(line.split()) > 0:
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
    wordslist = texts
    titlelist = labels
    transformer = TfidfTransformer()
    tfidf = vectorizer.fit_transform(wordslist)
    # print(tfidf)
    # print(vectorizer.fit_transform(wordslist))
    words = vectorizer.get_feature_names()  # candidate keywords over all texts
    weight = tfidf.toarray()
    n = 5  # keep the top five keywords
    for (title, w, text) in zip(titlelist, weight, texts):
        wordsdet = ['and', 'of', 'the', 'to', 'in', 'will', 'students', 'project', 'subject',
                    'assessment', 'hours', 'with', 'on', 'be', 'for', 'you', 'he', 'she', 'her', 'his']
        print(u'{}:'.format(title))
        # sort terms by descending tf-idf weight
        loc = np.argsort(-w)
        keywordsList = []
        # Keywords = ''
        i, j = 0, 0
        while j < n:
            if words[loc[i]] in wordsdet:
                i += 1
                continue
            keywordsList.append(words[loc[i]] + ',')
            print(u'-{}: {} {}'.format(str(j + 1), words[loc[i]], w[loc[i]]))
            i += 1
            j += 1
        Keywords = ''.join(keywordsList)
        post = {
            'Label': title,
            'KeyWords': Keywords,
            'Content': text
        }
        collection.insert_one(post)
        print('\n')

def TSNE(outfile):
    data = open(outfile).read()
    labels, texts = [], []
    for i, line in enumerate(data.split("\n")):
        if len(line.split()) > 0:
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
    # create a DataFrame with columns text and label
    trainDF = pandas.DataFrame()
    trainDF['seriesNum'] = range(0, 2225)
    trainDF['label'] = labels
    trainDF['text'] = texts
    trainDF['category_id'] = trainDF['label'].factorize()[0]
    labels = trainDF['category_id']
    category_id_df = trainDF[['label', 'category_id']].drop_duplicates().sort_values('category_id')
    category_to_id = dict(category_id_df.values)
    id_to_category = dict(category_id_df[['category_id', 'label']].values)
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                            ngram_range=(1, 2), stop_words='english')
    features = tfidf.fit_transform(trainDF['text']).toarray()
    SAMPLE_SIZE = int(len(features) * 0.3)
    np.random.seed(0)
    indices = np.random.choice(range(len(features)), size=SAMPLE_SIZE, replace=False)
    projected_features = manifold.TSNE(n_components=2, random_state=0).fit_transform(features[indices])
    colors = ['pink', 'green', 'midnightblue', 'orange', 'darkgrey']
    for category, category_id in sorted(category_to_id.items()):
        points = projected_features[(labels[indices] == category_id).values]
        plt.scatter(points[:, 0], points[:, 1], s=15, c=colors[category_id], label=category)
    plt.title("tf-idf feature vector for each article, projected on 2 dimensions.",
              fontdict=dict(fontsize=15))
    plt.legend()
    plt.show()

def multiple_classify(outfile, collection, classifier):
    collection.delete_many({})  # clear the collection before re-inserting
    data = open(outfile).read()
    labels, texts = [], []
    for i, line in enumerate(data.split("\n")):
        if len(line.split()) > 0:
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
    # create a DataFrame with columns text and label
    trainDF = pandas.DataFrame()
    trainDF['seriesNum'] = range(0, 2225)
    trainDF['label'] = labels
    trainDF['text'] = texts
    # print(trainDF)
    pipeline = Pipeline([
        ('tdidf_vectorizer', TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)),
        ('classifier', classifier)
    ])
    k_fold = model_selection.KFold(n_splits=5, shuffle=True)
    scores = []
    confusion = np.zeros((5, 5))
    for train_indices, test_indices in k_fold.split(trainDF):
        sub_seriesNum = trainDF['seriesNum'][test_indices].tolist()
        train_text = trainDF['text'][train_indices]
        train_y = trainDF['label'][train_indices]
        test_text = trainDF['text'][test_indices]
        test_text_list = test_text.tolist()
        test_y = trainDF['label'][test_indices]
        encoder = preprocessing.LabelEncoder()
        train_y = encoder.fit_transform(train_y)
        test_y = encoder.fit_transform(test_y)
        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)
        for i in range(len(predictions)):
            post = {
                'Num': int(sub_seriesNum[i]),
                'Predict_Label': int(predictions[i]),
                'Actual_Label': int(test_y[i]),
                'Content': test_text_list[i]
            }
            collection.insert_one(post)
        confusion += confusion_matrix(test_y, predictions)
        score = f1_score(test_y, predictions, average='macro')
        scores.append(score)
    print('Total news classified:', len(trainDF))
    print('Score:', sum(scores) / len(scores))
    print('Confusion matrix:')
    print(confusion)
    print('\n')

def main():
    # Set up database
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    db = myclient['BBC_NEWS']
    collection_set = {
        # 'Act': db['Student_Account'],
        # 'Sbj_Info': db['Subject_Info'],
        'BBC': db['BBC_News'],
        'BBC_Result': db['BBC_News_ClassificationResult'],
        'BBC_Biagram': db['BBC_News_FeatureKeyWords_tdidf_Biagram'],
        'BBC_countV': db['BBC_News_FeatureKeyWords_CountVectorizer'],
        'BBC_tdidf': db['BBC_News_FeatureKeyWords_tdidf']
    }
    vectorizer_set = {
        'Tdidf_diagram': TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                                         ngram_range=(1, 2), stop_words='english'),
        'Count': CountVectorizer(),
        'Tdidf': TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', stop_words='english')
    }
    # Preprocess
    targetfile = "Dataset/bbc-text.csv"
    outfile = database.preprocess(targetfile)
    # TSNE(outfile)
    # Insert the original CSV into the database
    Originfile_insert(outfile, collection_set['BBC'])
    # Keywords
    # feature_keywords(outfile, collection_set['BBC_Biagram'], vectorizer_set['Tdidf_diagram'])
    # feature_keywords(outfile, collection_set['BBC_countV'], vectorizer_set['Count'])
    # feature_keywords(outfile, collection_set['BBC_tdidf'], vectorizer_set['Tdidf'])
    # Classify
    classifier_set = {
        'NB': MultinomialNB(),
        'SVM': SVC(kernel='linear'),
        'DT': tree.DecisionTreeClassifier()
    }
    multiple_classify(outfile, collection_set['BBC_Result'], classifier_set['NB'])
    multiple_classify(outfile, collection_set['BBC_Result'], classifier_set['SVM'])
    # multiple_classify(outfile, collection_set['BBC_Result'], classifier_set['DT'])
    # Database input
    database.LabelDecoder(collection_set['BBC_Result'])
    pipeline = [{
        '$lookup': {
            'from': 'BBC_News_FeatureKeyWords_CountVectorizer',
            'localField': 'Content',
            'foreignField': 'Content',
            'as': 'Keywords_CountV'
        }}, {
        '$lookup': {
            'from': 'BBC_News_FeatureKeyWords_tdidf',
            'localField': 'Content',
            'foreignField': 'Content',
            'as': 'Keywords_tdidf'
        }}, {
        '$lookup': {
            'from': 'BBC_News_FeatureKeyWords_tdidf_Biagram',
            'localField': 'Content',
            'foreignField': 'Content',
            'as': 'Keywords_tdidf_Biagram'
        }}, {
        '$project': {
            'Label': {'$arrayElemAt': ['$Keywords_CountV.Label', 0]},
            'Keywords_tdidf': {'$arrayElemAt': ['$Keywords_tdidf.KeyWords', 0]},
            'Keywords_tdidf_Biagram': {'$arrayElemAt': ['$Keywords_tdidf_Biagram.KeyWords', 0]},
            'Keywords_CountV': {'$arrayElemAt': ['$Keywords_CountV.KeyWords', 0]},
            'Content': {'$arrayElemAt': ['$Keywords_CountV.Content', 0]}}}
    ]
    update_result = db['BBC_News'].aggregate(pipeline)
    for x in update_result:
        # print(x)
        db['BBC_News'].update({
            '_id': x['_id']
        }, {
            'Label': x['Label'],
            'Keywords_tdidf': x['Keywords_tdidf'],
            'Keywords_tdidf_Biagram': x['Keywords_tdidf_Biagram'],
            'Keywords_CountV': x['Keywords_CountV'],
            'Content': x['Content']
        })
    query = {
        "$where": "this.Predict_Label != this.Actual_Label"
    }
    answer = collection_set['BBC_Result'].find(query).sort('Num')
    # for x in answer:
    #     print(x)
    # feature_keywords("/Users/frank/PycharmProjects/FYP_classification/RAKE-tutorial/articles/txt/EIE3105.pdf.txt")

if __name__ == '__main__':
...
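The heart of feature_keywords() above is simply: rank each document's terms by tf-idf weight and keep the top n that are not in the ad-hoc stop list. Below is a standalone sketch of that idea; the two toy documents, their labels and top_n are invented for illustration, and get_feature_names_out() (scikit-learn 1.0+) stands in for the older get_feature_names() used in the script.

# Minimal sketch: per-document keyword ranking by tf-idf weight.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "the bank cut interest rates again this quarter",   # invented example text
    "the team won the championship after a late goal",  # invented example text
]
doc_labels = ["business", "sport"]

vectorizer = TfidfVectorizer(stop_words="english")
weights = vectorizer.fit_transform(docs).toarray()
terms = np.array(vectorizer.get_feature_names_out())

top_n = 3
for label, row in zip(doc_labels, weights):
    order = np.argsort(-row)  # term indices, heaviest tf-idf weight first
    print(label, list(terms[order[:top_n]]))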
testing.py
Source:testing.py
# -*- coding: utf-8 -*-
"""
    pygments.lexers.testing
    ~~~~~~~~~~~~~~~~~~~~~~~
    Lexers for testing languages.
    :copyright: Copyright 2006-2015 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
from pygments.lexer import RegexLexer, include, bygroups
from pygments.token import Comment, Keyword, Name, String

__all__ = ['GherkinLexer']

class GherkinLexer(RegexLexer):
    """
    For `Gherkin <http://github.com/aslakhellesoy/gherkin/>` syntax.
    .. versionadded:: 1.2
    """
    name = 'Gherkin'
    aliases = ['cucumber', 'gherkin']
    filenames = ['*.feature']
    mimetypes = ['text/x-gherkin']
    feature_keywords = u'^(기능|機能|功能|フィーチャ|خاصية|תכונה|Функціонал|Функционалност|Функционал|Фича|Особина|Могућност|Özellik|Właściwość|Tính năng|Trajto|Savybė|Požiadavka|Požadavek|Osobina|Ominaisuus|Omadus|OH HAI|Mogućnost|Mogucnost|Jellemző|Fīča|Funzionalità|Funktionalität|Funkcionalnost|Funkcionalitāte|Funcționalitate|Functionaliteit|Functionalitate|Funcionalitat|Funcionalidade|Fonctionnalité|Fitur|Feature|Egenskap|Egenskab|Crikey|Característica|Arwedd)(:)(.*)$'
    feature_element_keywords = u'^(\\s*)(시나리오 개요|시나리오|배경|背景|場景大綱|場景|场景大纲|场景|劇本大綱|劇本|テンプレ|シナリオテンプレート|シナリオテンプレ|シナリオアウトライン|シナリオ|سيناريو مخطط|سيناريو|الخلفية|תרחיש|תבנית תרחיש|רקע|Тарих|Сценарій|Сценарио|Сценарий структураси|Сценарий|Структура сценарію|Структура сценарија|Структура сценария|Скица|Рамка на сценарий|Пример|Предыстория|Предистория|Позадина|Передумова|Основа|Концепт|Контекст|Założenia|Wharrimean is|Tình huống|The thing of it is|Tausta|Taust|Tapausaihio|Tapaus|Szenariogrundriss|Szenario|Szablon scenariusza|Stsenaarium|Struktura scenarija|Skica|Skenario konsep|Skenario|Situācija|Senaryo taslağı|Senaryo|Scénář|Scénario|Schema dello scenario|Scenārijs pēc parauga|Scenārijs|Scenár|Scenaro|Scenariusz|Scenariul de Şablon|Scenariul de sablon|Scenariu|Scenario Outline|Scenario Amlinellol|Scenario|Scenarijus|Scenarijaus šablonas|Scenarij|Scenarie|Rerefons|Raamstsenaarium|Primer|Pozadí|Pozadina|Pozadie|Plan du scénario|Plan du Scénario|Osnova scénáře|Osnova|Náčrt Scénáře|Náčrt Scenáru|Mate|MISHUN SRSLY|MISHUN|Kịch bản|Konturo de la scenaro|Kontext|Konteksts|Kontekstas|Kontekst|Koncept|Khung tình huống|Khung kịch bản|Háttér|Grundlage|Geçmiş|Forgatókönyv vázlat|Forgatókönyv|Fono|Esquema do Cenário|Esquema do Cenario|Esquema del escenario|Esquema de l\'escenari|Escenario|Escenari|Dis is what went down|Dasar|Contexto|Contexte|Contesto|Condiţii|Conditii|Cenário|Cenario|Cefndir|Bối cảnh|Blokes|Bakgrunn|Bakgrund|Baggrund|Background|B4|Antecedents|Antecedentes|All y\'all|Achtergrond|Abstrakt Scenario|Abstract Scenario)(:)(.*)$'
    examples_keywords = u'^(\\s*)(예|例子|例|サンプル|امثلة|דוגמאות|Сценарији|Примери|Приклади|Мисоллар|Значения|Örnekler|Voorbeelden|Variantai|Tapaukset|Scenarios|Scenariji|Scenarijai|Příklady|Példák|Príklady|Przykłady|Primjeri|Primeri|Piemēri|Pavyzdžiai|Paraugs|Juhtumid|Exemplos|Exemples|Exemplele|Exempel|Examples|Esempi|Enghreifftiau|Ekzemploj|Eksempler|Ejemplos|EXAMPLZ|Dữ liệu|Contoh|Cobber|Beispiele)(:)(.*)$'
    step_keywords = u'^(\\s*)(하지만|조건|먼저|만일|만약|단|그리고|그러면|那麼|那么|而且|當|当|前提|假設|假如|但是|但し|並且|もし|ならば|ただし|しかし|かつ|و |متى |لكن |عندما |ثم |بفرض |اذاً |כאשר |וגם |בהינתן |אזי |אז |אבל |Якщо |Унда |То |Припустимо, що |Припустимо |Онда |Но |Нехай |Лекин |Когато |Када |Кад |К тому же |И |Задато |Задати |Задате |Если |Допустим |Дадено |Да |Бирок |Аммо |Али |Але |Агар |А |І |Și |És |Zatati |Zakładając |Zadato |Zadate |Zadano |Zadani |Zadan |Youse know when youse got |Youse know like when |Yna |Ya know how |Ya gotta |Y |Wun |Wtedy |When y\'all |When |Wenn |WEN |Và |Ve |Und |Un |Thì |Then y\'all |Then |Tapi |Tak |Tada |Tad |Så |Stel |Soit |Siis |Si |Sed |Se |Quando |Quand |Quan |Pryd |Pokud |Pokiaľ |Però |Pero |Pak |Oraz |Onda |Ond |Oletetaan |Og |Och |O zaman |Når |När |Niin |Nhưng |N |Mutta |Men |Mas |Maka |Majd |Mais |Maar |Ma |Lorsque |Lorsqu\'|Kun |Kuid |Kui |Khi |Keď |Ketika |Když |Kaj |Kai |Kada |Kad |Jeżeli |Ja |Ir |I CAN HAZ |I |Ha |Givun |Givet |Given y\'all |Given |Gitt |Gegeven |Gegeben sei |Fakat |Eğer ki |Etant donné |Et |Então |Entonces |Entao |En |Eeldades |E |Duota |Dun |Donitaĵo |Donat |Donada |Do |Diyelim ki |Dengan |Den youse gotta |De |Dato |Dar |Dann |Dan |Dado |Dacă |Daca |DEN |Când |Cuando |Cho |Cept |Cand |Cal |But y\'all |But |Buh |Biết |Bet |BUT |Atès |Atunci |Atesa |Anrhegedig a |Angenommen |And y\'all |And |An |Ama |Als |Alors |Allora |Ali |Aleshores |Ale |Akkor |Aber |AN |A také |A |\* )'
    tokens = {
        'comments': [
            (r'^\s*#.*$', Comment),
        ],
        'feature_elements': [
            (step_keywords, Keyword, "step_content_stack"),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'feature_elements_on_stack': [
            (step_keywords, Keyword, "#pop:2"),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'examples_table': [
            (r"\s+\|", Keyword, 'examples_table_header'),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'examples_table_header': [
            (r"\s+\|\s*$", Keyword, "#pop:2"),
            include('comments'),
            (r"\\\|", Name.Variable),
            (r"\s*\|", Keyword),
            (r"[^|]", Name.Variable),
        ],
        'scenario_sections_on_stack': [
            (feature_element_keywords,
             bygroups(Name.Function, Keyword, Keyword, Name.Function),
             "feature_elements_on_stack"),
        ],
        'narrative': [
            include('scenario_sections_on_stack'),
            include('comments'),
            (r"(\s|.)", Name.Function),
        ],
        'table_vars': [
            (r'(<[^>]+>)', Name.Variable),
        ],
        'numbers': [
            (r'(\d+\.?\d*|\d*\.\d+)([eE][+-]?[0-9]+)?', String),
        ],
        'string': [
            include('table_vars'),
            (r'(\s|.)', String),
        ],
        'py_string': [
            (r'"""', Keyword, "#pop"),
            include('string'),
        ],
        'step_content_root': [
            (r"$", Keyword, "#pop"),
            include('step_content'),
        ],
        'step_content_stack': [
            (r"$", Keyword, "#pop:2"),
            include('step_content'),
        ],
        'step_content': [
            (r'"', Name.Function, "double_string"),
            include('table_vars'),
            include('numbers'),
            include('comments'),
            (r'(\s|.)', Name.Function),
        ],
        'table_content': [
            (r"\s+\|\s*$", Keyword, "#pop"),
            include('comments'),
            (r"\\\|", String),
            (r"\s*\|", Keyword),
            include('string'),
        ],
        'double_string': [
            (r'"', Name.Function, "#pop"),
            include('string'),
        ],
        'root': [
            (r'\n', Name.Function),
            include('comments'),
            (r'"""', Keyword, "py_string"),
            (r'\s+\|', Keyword, 'table_content'),
            (r'"', Name.Function, "double_string"),
            include('table_vars'),
            include('numbers'),
            (r'(\s*)(@[^@\r\n\t ]+)', bygroups(Name.Function, Name.Tag)),
            (step_keywords, bygroups(Name.Function, Keyword),
             'step_content_root'),
            (feature_keywords, bygroups(Keyword, Keyword, Name.Function),
             'narrative'),
            (feature_element_keywords,
             bygroups(Name.Function, Keyword, Keyword, Name.Function),
             'feature_elements'),
            (examples_keywords,
             bygroups(Name.Function, Keyword, Keyword, Name.Function),
             'examples_table'),
            (r'(\s|.)', Name.Function),
        ]
...
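A quick way to exercise this lexer is through pygments' public highlight() API. The .feature snippet below is made up for the example; only pygments itself needs to be installed.

# Highlight a small, made-up Gherkin feature in the terminal using the lexer above.
from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import GherkinLexer

feature_text = """\
Feature: Login
  Scenario: Valid credentials
    Given a registered user
    When they sign in with a valid password
    Then they see their dashboard
"""
print(highlight(feature_text, GherkinLexer(), TerminalFormatter()))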
disaster_clf.py
Source:disaster_clf.py
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from joblib import dump, load
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.corpus.reader import wordnet
from nltk.stem import LancasterStemmer, PorterStemmer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

# in the train set, keyword has 61 NaN (missing value) rows
# in the train set, location has 2533 NaN (missing value) rows
TRAINING_FILE_NAME = 'dataset/train.csv'
KEYWORD_OHE_PATH = './lib/keyword_ohe.lib'
TEXT_VECTORIZER_PATH = './lib/text_vectorizer.lib'
TRAINING_DATAFRAME_PATH = './lib/training_df.lib'
TRAINING_TARGET_PATH = './lib/training_target.lib'
TESTING_TARGET_PATH = './lib/testing_target.lib'
TESTING_DATAFRAME_PATH = './lib/target_df.lib'
KEYWORD_LBE_PATH = './lib/keyword_lbe.lib'
USE_LABEL_ENCODER = False
SAVE_MODEL = False
USE_LEMMATIZER = False
USE_LANCASTER_STEM = True
TEST_PREDICT_FILE = './dataset/test.csv'
SUBMISSION_FILE = './submission/disaster_clf.csv'
SAMPLE_SUBMISSION_FILE = './dataset/sample_submission.csv'

def download_nltk_package():
    nltk.download('averaged_perceptron_tagger')
    nltk.download('words')
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')

def data_info(df):
    print('keyword features')
    print('-------------------------------------')
    feature_keywords = df['keyword'].value_counts()
    print(feature_keywords)
    print('######################################')
    print(f'there are {feature_keywords.count()} unique features')
    print('-------------------------------------')
    print()
    print('location features')
    print('-------------------------------------')
    feature_locations = df['location'].value_counts()
    print(feature_locations)
    print('######################################')
    print(f'there are {feature_locations.count()} unique features')
    print('-------------------------------------')
    print()
    print('label')
    print('-------------------------------------')
    feature_locations = df['target'].value_counts()
    print(feature_locations)
    print()

def read_csv(file_name):
    data = pd.read_csv(file_name)
    # create dataframe
    train_df = pd.DataFrame(data)
    return train_df

# find the part of speech of a word
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.VERB

def pre_process_text(df, use_lemmatizer, use_lancaster_stem):
    words = set(nltk.corpus.words.words())
    lemmatizer = WordNetLemmatizer()
    lancaster_stemmer = LancasterStemmer()
    porter_stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    texts = []
    for _, row in df.iterrows():
        text = row['text']
        # remove words that are not in the English corpus and transform them to lower case
        text = " ".join(w.lower() for w in nltk.wordpunct_tokenize(text) if w.lower() in words)
        # remove URLs
        text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                      '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        # remove numbers
        text = re.sub(r'\d+', '', text)
        # remove punctuation marks
        text = text.translate(str.maketrans('', '', string.punctuation))
        # remove extra white space
        text = text.strip()
        # tokenize (change to a list of terms)
        text_tokenize = word_tokenize(text)
        # lemmatize (or stem, depending on the option) every word
        root_texts = []
        for word in text_tokenize:
            if use_lemmatizer:
                root_texts.append(lemmatizer.lemmatize(word, get_wordnet_pos(word)))
            elif use_lancaster_stem:
                root_texts.append(lancaster_stemmer.stem(word))
            else:
                root_texts.append(porter_stemmer.stem(word))
        # transform the list back to a string
        text = " ".join(root_texts)
        texts.append(text)
    df['text'] = texts
    return df

def pre_processing(df, keyword_ohe_path, keyword_lbe_path, text_vectorizer_path,
                   df_path, save_model, use_label_encoder, use_lemmatizer,
                   use_lancaster_stem, vectorizer_input):
    preprocess_df = df
    shape = preprocess_df.shape[1]
    if use_label_encoder:
        # encode the keyword column using a label encoder
        encoder = LabelEncoder()
        preprocess_df['keyword'] = encoder.fit_transform(preprocess_df['keyword'])
        if save_model:
            dump(encoder, keyword_lbe_path)
    else:
        # encode the keyword column using a one-hot encoder
        encoder = OneHotEncoder()
        keyword_temp = np.array(preprocess_df['keyword']).reshape(-1, 1)
        keyword_encoder = encoder.fit_transform(keyword_temp).toarray()
        new_keyword = pd.DataFrame(keyword_encoder)
        # dump keyword encoder
        if save_model:
            dump(encoder, keyword_ohe_path)
        # concat the encoded keyword back to the dataset
        preprocess_df = pd.concat([preprocess_df.reset_index(drop=True), new_keyword.reset_index(drop=True)], axis=1)
        preprocess_df = pd.DataFrame(preprocess_df)
        preprocess_df.rename(columns=dict(zip(preprocess_df.columns[shape:],
                                              np.array(encoder.categories_).ravel())), inplace=True)
    # perform text cleaning
    preprocess_df = pre_process_text(preprocess_df, use_lemmatizer, use_lancaster_stem)

    vectorizer = None
    if vectorizer_input is None:
        # vectorizer = TfidfVectorizer(stop_words='english', sublinear_tf=True)
        vectorizer = CountVectorizer(stop_words='english')
        text_vector = vectorizer.fit_transform(preprocess_df['text']).toarray()
    else:
        vectorizer = vectorizer_input
        text_vector = vectorizer_input.transform(preprocess_df['text']).toarray()
    # # Truncated SVD to reduce dimensionality for sparse data
    # svd = TruncatedSVD(n_components=100, n_iter=10, random_state=42)
    # text_vector_tran = svd.fit_transform(text_vector)
    # new_text = pd.DataFrame(text_vector_tran)

    new_text = pd.DataFrame(text_vector)
    # dump text vectorizer
    if save_model:
        dump(vectorizer, text_vectorizer_path)
    # drop the keyword and text columns
    if not use_label_encoder:
        preprocess_df = preprocess_df.drop(columns='keyword')
    preprocess_df = preprocess_df.drop(columns='text')
    shape_2 = preprocess_df.shape[1]
    # concat the text vectors to the dataset
    preprocess_df = pd.concat([preprocess_df.reset_index(drop=True), new_text.reset_index(drop=True)], axis=1)
    preprocess_df.rename(columns=dict(zip(preprocess_df.columns[shape_2:],
                                          vectorizer.get_feature_names())), inplace=True)
    # dump dataframe
    if save_model:
        dump(preprocess_df, df_path)
    return preprocess_df, vectorizer

# download_nltk_package()
df = read_csv(TRAINING_FILE_NAME)
X = df.drop(columns='location')
# drop all rows where the keyword column is NaN
X = X.dropna()
y = X['target']
X = X.drop(columns=['id', 'target'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)
X_train, vectorizer = pre_processing(X_train, KEYWORD_OHE_PATH, KEYWORD_LBE_PATH, TEXT_VECTORIZER_PATH,
                                     TRAINING_DATAFRAME_PATH, SAVE_MODEL, USE_LABEL_ENCODER,
                                     USE_LEMMATIZER, USE_LANCASTER_STEM, None)
if SAVE_MODEL:
    dump(y_train, TRAINING_TARGET_PATH)
    dump(y_test, TESTING_TARGET_PATH)
X_test, _ = pre_processing(X_test, KEYWORD_OHE_PATH, KEYWORD_LBE_PATH, TEXT_VECTORIZER_PATH,
                           TESTING_DATAFRAME_PATH, SAVE_MODEL, USE_LABEL_ENCODER,
                           USE_LEMMATIZER, USE_LANCASTER_STEM, vectorizer)
# load saved model files, in case they are needed
# X_train = load(TRAINING_DATAFRAME_PATH)
# vectorizer = load(TEXT_VECTORIZER_PATH)
# X_test = load(TESTING_DATAFRAME_PATH)
# y_train = load(TRAINING_TARGET_PATH)
# y_test = load(TESTING_TARGET_PATH)
# perform scaling (to fix negative values when training MultinomialNB())
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# scaler = MinMaxScaler()
# X_test = scaler.fit_transform(X_test)
clf = BernoulliNB()
# clf = MultinomialNB()
# clf = GaussianNB()
# clf = RandomForestClassifier(n_jobs=3, n_estimators=500, verbose=True)
# clf = LinearSVC()
# clf = SVC(kernel='linear')
# clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf)
y_pred = clf.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(classification_report(y_pred, y_test))
# prepare the submission file for Kaggle
test = read_csv(TEST_PREDICT_FILE)
sample_sub = read_csv(SAMPLE_SUBMISSION_FILE)
test = test.drop(columns=['location', 'id'])
test['keyword'] = test['keyword'].fillna('ablaze')
test, _ = pre_processing(test, KEYWORD_OHE_PATH, KEYWORD_LBE_PATH, TEXT_VECTORIZER_PATH,
                         TRAINING_DATAFRAME_PATH, SAVE_MODEL, USE_LABEL_ENCODER,
                         USE_LEMMATIZER, USE_LANCASTER_STEM, vectorizer)
sample_sub['target'] = clf.predict(test)
...
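Condensed sketch of the pipeline above: clean the tweet text, turn it into a bag-of-words matrix, and fit a Bernoulli naive Bayes model. The three example tweets and their labels are invented for illustration; the real script reads dataset/train.csv and additionally one-hot encodes the keyword column.

# Sketch: text cleaning + CountVectorizer + BernoulliNB on invented toy data.
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

tweets = [
    "Forest fire near the lake, residents asked to shelter",  # invented
    "13,000 people receive evacuation orders in California",   # invented
    "I love fruits and sunny afternoons",                      # invented
]
labels = [1, 1, 0]  # 1 = real disaster, 0 = not

def clean(text):
    text = re.sub(r"http\S+", "", text)                         # drop URLs
    text = re.sub(r"\d+", "", text)                             # drop numbers
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text.lower().strip()

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform([clean(t) for t in tweets])
clf = BernoulliNB().fit(X, labels)
print(clf.predict(vectorizer.transform([clean("Huge wildfire forces evacuation near the forest")])))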
experiment_factory.py
Source:experiment_factory.py
1"""Module for running decoding experiments."""2from pathlib import Path3from typing import Optional, Sequence, Union4import numpy as np5import pandas as pd6from joblib import Parallel, delayed7from sklearn.model_selection import BaseCrossValidator8import pte_decode9def run_experiment(10 feature_root: Union[Path, str],11 feature_files: Union[12 Path, str, list[Path], list[str], list[Union[Path, str]]13 ],14 n_jobs: int = 1,15 **kwargs,16) -> list[Optional[pte_decode.Experiment]]:17 """Run prediction experiment with given number of files."""18 if not feature_files:19 raise ValueError("No feature files specified.")20 if not isinstance(feature_files, list):21 feature_files = [feature_files]22 if len(feature_files) == 1 or n_jobs in (0, 1):23 return [24 _run_single_experiment(25 feature_root=feature_root,26 feature_file=feature_file,27 **kwargs,28 )29 for feature_file in feature_files30 ]31 return [32 Parallel(n_jobs=n_jobs)(33 delayed(_run_single_experiment)(34 feature_root=feature_root, feature_file=feature_file, **kwargs35 )36 for feature_file in feature_files37 )38 ] # type: ignore39def _run_single_experiment(40 feature_root: Union[Path, str],41 feature_file: Union[Path, str],42 classifier: str,43 label_channels: Sequence[str],44 target_begin: Union[str, int, float],45 target_end: Union[str, int, float],46 optimize: bool,47 balancing: Optional[str],48 out_root: Union[Path, str],49 use_channels: str,50 feature_keywords: Sequence,51 cross_validation: BaseCrossValidator,52 plot_target_channels: list[str],53 scoring: str = "balanced_accuracy",54 artifact_channels=None,55 bad_epochs_path: Optional[Union[Path, str]] = None,56 pred_mode: str = "classify",57 pred_begin: Union[int, float] = -3.0,58 pred_end: Union[int, float] = 2.0,59 use_times: int = 1,60 dist_onset: Union[int, float] = 2.0,61 dist_end: Union[int, float] = 2.0,62 excep_dist_end: Union[int, float] = 0.5,63 exceptions=None,64 feature_importance=False,65 verbose: bool = True,66) -> Optional[pte_decode.Experiment]:67 """Run experiment with single file."""68 import pte # pylint: disable=import-outside-toplevel69 from py_neuromodulation import (70 nm_analysis,71 ) # pylint: disable=import-outside-toplevel72 print("Using file: ", feature_file)73 # Read features using py_neuromodulation74 nm_reader = nm_analysis.Feature_Reader(75 feature_dir=str(feature_root), feature_file=str(feature_file)76 )77 features = nm_reader.feature_arr78 settings = nm_reader.settings79 sidecar = nm_reader.sidecar80 # Pick label for classification81 try:82 label = _get_column_picks(83 column_picks=label_channels,84 features=features,85 )86 except ValueError as error:87 print(error, "Discarding file: {feature_file}")88 return None89 # Handle bad events file90 bad_epochs_df = pte.filetools.get_bad_epochs(91 bad_epochs_dir=bad_epochs_path, filename=feature_file92 )93 bad_epochs = bad_epochs_df.event_id.to_numpy() * 294 # Pick target for plotting predictions95 target_series = _get_column_picks(96 column_picks=plot_target_channels,97 features=features,98 )99 features_df = get_feature_df(features, feature_keywords, use_times)100 # Pick artifact channel101 if artifact_channels:102 artifacts = _get_column_picks(103 column_picks=artifact_channels,104 features=features,105 ).to_numpy()106 else:107 artifacts = None108 # Generate output file name109 out_path = _generate_outpath(110 out_root,111 feature_file,112 classifier,113 target_begin,114 target_end,115 use_channels,116 optimize,117 use_times,118 )119 dist_end = _handle_exception_files(120 fullpath=out_path,121 
dist_end=dist_end,122 excep_dist_end=excep_dist_end,123 exception_files=exceptions,124 )125 side = "right" if "R_" in str(out_path) else "left"126 decoder = pte_decode.get_decoder(127 classifier=classifier,128 scoring=scoring,129 balancing=balancing,130 optimize=optimize,131 )132 # Initialize Experiment instance133 experiment = pte_decode.Experiment(134 features=features_df,135 plotting_target=target_series,136 pred_label=label,137 ch_names=sidecar["ch_names"],138 decoder=decoder,139 side=side,140 artifacts=artifacts,141 bad_epochs=bad_epochs,142 sfreq=settings["sampling_rate_features"],143 scoring=scoring,144 feature_importance=feature_importance,145 target_begin=target_begin,146 target_end=target_end,147 dist_onset=dist_onset,148 dist_end=dist_end,149 use_channels=use_channels,150 pred_mode=pred_mode,151 pred_begin=pred_begin,152 pred_end=pred_end,153 cv_outer=cross_validation,154 verbose=verbose,155 )156 experiment.run()157 experiment.save_results(path=out_path)158 # experiment.fit_and_save(path=out_path)159 return experiment160def _handle_exception_files(161 fullpath: Union[Path, str],162 dist_end: Union[int, float],163 excep_dist_end: Union[int, float],164 exception_files: Optional[Sequence] = None,165):166 """Check if current file is listed in exception files."""167 if exception_files:168 if any(exc in str(fullpath) for exc in exception_files):169 print("Exception file recognized: ", Path(fullpath).name)170 return excep_dist_end171 return dist_end172def _generate_outpath(173 root: Union[Path, str],174 feature_file: Union[Path, str],175 classifier: str,176 target_begin: Union[str, int, float],177 target_end: Union[str, int, float],178 use_channels: str,179 optimize: bool,180 use_times: int,181) -> Path:182 """Generate file name for output files."""183 if target_begin == 0.0:184 target_begin = "trial_begin"185 if target_end == 0.0:186 target_end = "trial_begin"187 target_str = "_".join(("decode", str(target_begin), str(target_end)))188 clf_str = "_".join(("model", classifier))189 ch_str = "_".join(("chs", use_channels))190 opt_str = "yes_opt" if optimize else "no_opt"191 feat_str = "_".join(("feats", str(use_times * 100), "ms"))192 out_name = "_".join((target_str, clf_str, ch_str, opt_str, feat_str))193 return Path(root, out_name, feature_file, feature_file)194def get_feature_df(195 data: pd.DataFrame, feature_keywords: Sequence, use_times: int = 1196) -> pd.DataFrame:197 """Extract features to use from given DataFrame."""198 column_picks = [199 col200 for col in data.columns201 if any(pick in col for pick in feature_keywords)202 ]203 used_features = data[column_picks]204 # Initialize list of features to use205 features = [206 used_features.rename(207 columns={col: col + "_100_ms" for col in used_features.columns}208 )209 ]210 # Use additional features from previous time points211 # use_times = 1 means no features from previous time points are212 # being used213 for use_time in np.arange(1, use_times):214 features.append(215 used_features.shift(use_time, axis=0).rename(216 columns={217 col: col + "_" + str((use_time + 1) * 100) + "_ms"218 for col in used_features.columns219 }220 )221 )222 # Return final features dataframe223 return pd.concat(features, axis=1).fillna(0.0)224def _get_column_picks(225 column_picks: Sequence[str],226 features: pd.DataFrame,227) -> pd.Series:228 """Return first found column pick from features DataFrame."""229 for pick in column_picks:230 for col in features.columns:231 if pick.lower() in col.lower():232 return pd.Series(data=features[col], name=col)233 
raise ValueError(234 f"No valid column found. `column_picks` given: {column_picks}."...
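get_feature_df() above boils down to: keep the columns whose names contain one of the feature keywords, then append shifted copies of them as features from earlier time windows. Below is a standalone pandas sketch of that idea; the column names, keyword list and the 100 ms window naming are assumptions for illustration, not part of pte_decode's API.

# Sketch: keyword-based column selection plus lagged copies as extra features.
import pandas as pd

data = pd.DataFrame({
    "ch1_beta_power": [1.0, 2.0, 3.0, 4.0],   # invented feature column
    "ch1_gamma_power": [0.5, 0.6, 0.7, 0.8],  # invented feature column
    "ch1_raw": [9.0, 9.0, 9.0, 9.0],          # not picked (no keyword match)
})
feature_keywords = ["beta", "gamma"]
use_times = 2  # current window plus one lagged window

picked = data[[c for c in data.columns if any(k in c for k in feature_keywords)]]
frames = [picked.rename(columns={c: c + "_100_ms" for c in picked.columns})]
for lag in range(1, use_times):
    frames.append(
        picked.shift(lag).rename(
            columns={c: f"{c}_{(lag + 1) * 100}_ms" for c in picked.columns}
        )
    )
features = pd.concat(frames, axis=1).fillna(0.0)
print(features)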