How to use text_clean method in autotest

Best Python code snippet using autotest_python
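In the snippets below, text_clean is not an autotest API; it is the name of a pandas DataFrame column that holds tweet or comment text after cleaning (URLs, mentions, hashtags and stray punctuation removed). As a rough illustration of how such a column gets built (the column name 'text', the helper basic_clean, and the regexes are made up for this example, not taken from the snippets), here is a minimal sketch:

import re
import pandas as pd

# Illustrative sketch only: derive a 'text_clean' column from raw text.
df = pd.DataFrame({'text': ["Check this out! https://t.co/abc @user #wow"]})

def basic_clean(text):
    text = re.sub(r'http\S+', '', text)          # drop URLs
    text = re.sub(r'[@#]\w+', '', text)          # drop mentions and hashtags
    return re.sub(r'\s+', ' ', text).strip()     # collapse whitespace

df['text_clean'] = df['text'].map(basic_clean)
print(df['text_clean'][0])   # -> "Check this out!"

Each snippet that follows either builds or consumes a column like this one.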

funcmap.py

Source: funcmap.py (GitHub)


import pandas as pd
import numpy as np
import sys
import textwrap
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go

# taking the user input as a string (the first command-line argument)
user_input = sys.argv[1]
# loading in the tweet datasets
df_earthquake = pd.read_csv('./final_labeled_earthquake.csv')
df_wildfire = pd.read_csv('./final_labeled_wildfire.csv')
df_hurricane = pd.read_csv('./final_labeled_hurricane.csv')

# create a function that returns only the tweets matching ALL of the keywords
def strict_matches_df(user_input, user_event):
    df = None
    if user_event == "hurricane":
        df = df_hurricane
    elif user_event == "wildfire":
        df = df_wildfire
    elif user_event == "earthquake":
        df = df_earthquake
    elif user_event == "all":
        df = pd.concat([df_earthquake, df_wildfire, df_hurricane])
    matches = []
    # split the entered keywords by comma
    keywords = user_input.split(',')
    # create an empty list to store keywords
    keyword_list = []
    for i in range(len(keywords)):
        # remove white space
        keywords[i] = keywords[i].strip()
        print(keywords[i])
        # append it to keyword_list
        keyword_list.append(keywords[i])
    # resetting the df to be just latitude, longitude and clean text
    df = df[['lat', 'long', 'text_clean']].copy()
    df['text_clean'] = df['text_clean'].apply(lambda t: "<br>".join(textwrap.wrap(t)))
    # filtering the dataframe to include ONLY tweets that include ALL of the keywords
    if len(keyword_list) == 1:
        df = df[df['text_clean'].str.contains(keyword_list[0])]
    elif len(keyword_list) == 2:
        df = df[df['text_clean'].str.contains(keyword_list[0]) & df['text_clean'].str.contains(keyword_list[1])]
    elif len(keyword_list) == 3:
        df = df[df['text_clean'].str.contains(keyword_list[0]) & df['text_clean'].str.contains(keyword_list[1])
                & df['text_clean'].str.contains(keyword_list[2])]
    elif len(keyword_list) == 4:
        df = df[df['text_clean'].str.contains(keyword_list[0]) & df['text_clean'].str.contains(keyword_list[1])
                & df['text_clean'].str.contains(keyword_list[2]) & df['text_clean'].str.contains(keyword_list[3])]
    elif len(keyword_list) == 5:
        df = df[df['text_clean'].str.contains(keyword_list[0]) & df['text_clean'].str.contains(keyword_list[1])
                & df['text_clean'].str.contains(keyword_list[2]) & df['text_clean'].str.contains(keyword_list[3])
                & df['text_clean'].str.contains(keyword_list[4])]
    elif len(keyword_list) == 6:
        df = df[df['text_clean'].str.contains(keyword_list[0]) & df['text_clean'].str.contains(keyword_list[1])
                & df['text_clean'].str.contains(keyword_list[2]) & df['text_clean'].str.contains(keyword_list[3])
                & df['text_clean'].str.contains(keyword_list[4]) & df['text_clean'].str.contains(keyword_list[5])]
    return df

# create a function that returns the tweets matching ANY of the keywords
def loose_matches_df(user_input, user_event):
    df = None
    if user_event == "hurricane":
        df = df_hurricane
    elif user_event == "wildfire":
        df = df_wildfire
    elif user_event == "earthquake":
        df = df_earthquake
    elif user_event == "all":
        df = pd.concat([df_earthquake, df_wildfire, df_hurricane])
    matches = []
    # split the entered keywords by comma
    keywords = user_input.split(',')
    # create an empty list to store keywords
    keyword_list = []
    for i in range(len(keywords)):
        # remove white space
        keywords[i] = keywords[i].strip()
        print(keywords[i])
        # append it to keyword_list
        keyword_list.append(keywords[i])
    # resetting the df to be just latitude, longitude and clean text
    df = df[['lat', 'long', 'text_clean']].copy()
    df['text_clean'] = df['text_clean'].apply(lambda t: "<br>".join(textwrap.wrap(t)))
    # filtering the dataframe to include tweets that include ANY of the keywords
    if len(keyword_list) == 1:
        df = df[df['text_clean'].str.contains(keyword_list[0])]
    elif len(keyword_list) == 2:
        df = df[df['text_clean'].str.contains(keyword_list[0]) | df['text_clean'].str.contains(keyword_list[1])]
    elif len(keyword_list) == 3:
        df = df[df['text_clean'].str.contains(keyword_list[0]) | df['text_clean'].str.contains(keyword_list[1])
                | df['text_clean'].str.contains(keyword_list[2])]
    elif len(keyword_list) == 4:
        df = df[df['text_clean'].str.contains(keyword_list[0]) | df['text_clean'].str.contains(keyword_list[1])
                | df['text_clean'].str.contains(keyword_list[2]) | df['text_clean'].str.contains(keyword_list[3])]
    elif len(keyword_list) == 5:
        df = df[df['text_clean'].str.contains(keyword_list[0]) | df['text_clean'].str.contains(keyword_list[1])
                | df['text_clean'].str.contains(keyword_list[2]) | df['text_clean'].str.contains(keyword_list[3])
                | df['text_clean'].str.contains(keyword_list[4])]
    elif len(keyword_list) == 6:
        df = df[df['text_clean'].str.contains(keyword_list[0]) | df['text_clean'].str.contains(keyword_list[1])
                | df['text_clean'].str.contains(keyword_list[2]) | df['text_clean'].str.contains(keyword_list[3])
                | df['text_clean'].str.contains(keyword_list[4]) | df['text_clean'].str.contains(keyword_list[5])]
    return df

# plot the strict matches on an OpenStreetMap scatter map and save it for Flask
def map_strict(user_input, user_event):
    fig = px.scatter_mapbox(strict_matches_df(user_input, user_event), lat='lat', lon='long',
                            color_discrete_sequence=['navy'], hover_data=['text_clean'], zoom=3, height=500)
    fig.update_layout(mapbox_style='open-street-map')
    fig.update_layout(margin={'r': 0, 't': 0, 'l': 0, 'b': 0})
    pyo.iplot(fig)
    return fig.write_html('./flask_map1.html')

# plot the loose matches the same way
def map_loose(user_input, user_event):
    fig = px.scatter_mapbox(loose_matches_df(user_input, user_event), lat='lat', lon='long',
                            color_discrete_sequence=['navy'], hover_data=['text_clean'], zoom=3, height=500)
    fig.update_layout(mapbox_style='open-street-map')
    fig.update_layout(margin={'r': 0, 't': 0, 'l': 0, 'b': 0})
    pyo.iplot(fig)
    ...
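The strict_matches_df and loose_matches_df functions above enumerate a separate branch for every keyword count up to six. A shorter equivalent, shown here only as a sketch (filter_by_keywords is a hypothetical helper, not part of the original file), builds the boolean mask in a loop over the same text_clean column:

# Sketch: collapse the per-keyword-count branches into a single loop.
def filter_by_keywords(df, keyword_list, require_all=True):
    mask = None
    for kw in keyword_list:
        hit = df['text_clean'].str.contains(kw)
        if mask is None:
            mask = hit
        elif require_all:
            mask = mask & hit    # strict: the tweet must contain every keyword
        else:
            mask = mask | hit    # loose: the tweet may contain any keyword
    return df if mask is None else df[mask]

With a helper like this, the strict and loose functions reduce to a single call with require_all set to True or False, and a call such as map_strict('flood, rescue', 'hurricane') would plot only hurricane tweets whose text_clean contains both keywords.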


sentiment_analysis.py

Source: sentiment_analysis.py (GitHub)


import pandas as pd
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC, SVC
from sklearn import pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from itertools import groupby
import cPickle as pickle
import re
import os

# read different word lists and return a set of words
def read_wordList(file):
    with open(file, 'r') as f:
        return set([word.split('/')[0].strip().lower() for word in f if word])

def tokenize_n_character(text):
    return text.split()

def remove_punctuation(text):
    punctuation = '#@!"$%&()*+,-./:;<=>?[\]^_`{|}~' + "'"
    for p in punctuation:
        text = text.replace(p, "")
    return text

def remove_mentions(text, replace_token):
    return re.sub(r'(?:@[\w_]+)', replace_token, text)

def remove_hashtags(text, replace_token):
    return re.sub(r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", replace_token, text)

def remove_url(text, replace_token):
    regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return re.sub(regex, replace_token, text)

def get_emojis(path):
    emoji_dict = {}
    df_emojis = pd.read_csv(path, encoding="utf-8", delimiter=",")
    for index, row in df_emojis.iterrows():
        occurrences = row['Occurrences']
        pos = (row['Positive'] + 1) / (occurrences + 3)
        neg = (row['Negative'] + 1) / (occurrences + 3)
        sent = pos - neg
        emoji_dict[row['Emoji']] = sent
    return emoji_dict

def countCharacterFlooding(text):
    text = ''.join(text.split())
    groups = groupby(text)
    cnt = 0
    for label, group in groups:
        char_cnt = sum(1 for _ in group)
        if char_cnt > 2:
            cnt += 1
    return cnt

# count specific characters
def count_patterns(text, list):
    cnt = 0
    length = len(text)
    for pattern in list:
        cnt += text.count(pattern)
    if length == 0:
        return 0
    return cnt / length

# get sentiment according to emojis
def get_sentiment(text, emoji_dict):
    sentiment = 0
    list = emoji_dict.keys()
    for pattern in list:
        text_cnt = text.count(pattern)
        sentiment += emoji_dict[pattern] * text_cnt
    return sentiment

def get_affix(text):
    return " ".join([word[-4:] if len(word) >= 4 else word for word in text.split()])

def affix_punct(text):
    punct = '!"$%&()*+,-./:;<=>?[\]^_`{|}~'
    ngrams = []
    for i, character in enumerate(text[0:-2]):
        ngram = text[i:i+3]
        if ngram[0] in punct:
            for p in punct:
                if p in ngram[1:]:
                    break
            else:
                ngrams.append(ngram)
    return "###".join(ngrams)

def affix_punct_tokenize(text):
    tokens = text.split('###')
    return tokens

def get_ngrams(text):
    ngrams = []
    for word in text.split():
        if len(word) > 4:
            for i in range(len(word) - 3):
                ngrams.append(word[i:i + 4])
        else:
            ngrams.append(word)
    print(ngrams)
    return " ".join(ngrams)

# fit and transform text features, used in scikit FeatureUnion
class text_col(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, data_dict):
        return data_dict[self.key]

# fit and transform numeric features, used in scikit FeatureUnion
class digit_col(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, hd_searches):
        d_col_drops = ['text', 'no_punctuation', 'text_clean', 'affixes', 'affix_punct']
        hd_searches = hd_searches.drop(d_col_drops, axis=1).values
        scaler = preprocessing.MinMaxScaler().fit(hd_searches)
        return scaler.transform(hd_searches)

# preprocess and tag data and write it to csv for later use
def preprocess(df_data):
    df_data['text_clean_r'] = df_data['text'].map(lambda x: remove_hashtags(x, '#HASHTAG'))
    df_data['text_clean_r'] = df_data['text_clean_r'].map(lambda x: remove_url(x, "HTTPURL"))
    df_data['text_clean_r'] = df_data['text_clean_r'].map(lambda x: remove_mentions(x, '@MENTION'))
    df_data['text_clean'] = df_data['text'].map(lambda x: remove_hashtags(x, ''))
    df_data['text_clean'] = df_data['text_clean'].map(lambda x: remove_url(x, ""))
    df_data['text_clean'] = df_data['text_clean'].map(lambda x: remove_mentions(x, ''))
    df_data['no_punctuation'] = df_data['text_clean'].map(lambda x: remove_punctuation(x))
    df_data['text_clean'] = df_data['text_clean_r']
    df_data = df_data.drop('text_clean_r', 1)
    return df_data

def createFeatures(df_data):
    folder_path = os.path.dirname(os.path.realpath(__file__))
    emoji_path = os.path.join(folder_path, 'models', 'emoji_dataset.csv')
    emoji_dict = get_emojis(emoji_path)
    emoji_list = emoji_dict.keys()
    df_data['affixes'] = df_data['text_clean'].map(lambda x: get_affix(x))
    df_data['affix_punct'] = df_data['text_clean'].map(lambda x: affix_punct(x))
    df_data['number_of_emojis'] = df_data['text_clean'].map(lambda x: count_patterns(x, emoji_list))
    df_data['sentiment'] = df_data['text_clean'].map(lambda x: get_sentiment(x, emoji_dict))
    df_data['number_of_character_floods'] = df_data['no_punctuation'].map(lambda x: countCharacterFlooding(x))
    return df_data
...
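For context, here is how preprocess and createFeatures might be chained on a toy DataFrame. The 'text' column name matches the code above, the two sample tweets are invented, createFeatures expects models/emoji_dataset.csv next to the script, and the file targets an older Python 2 / scikit-learn stack (cPickle, sklearn.externals.joblib), so treat this as a sketch rather than a drop-in test:

# Hypothetical usage of the preprocessing above (toy data, not from the original repo).
df_tweets = pd.DataFrame({'text': [
    "Loving the new release!!! #python @devteam https://example.com",
    "soooo goooood :)",
]})
df_tweets = preprocess(df_tweets)       # adds 'text_clean' (with #HASHTAG/HTTPURL/@MENTION tokens) and 'no_punctuation'
df_tweets = createFeatures(df_tweets)   # adds affix, emoji-count, sentiment and character-flooding features
print(df_tweets[['text_clean', 'number_of_character_floods']])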


make_dataset.py

Source: make_dataset.py (GitHub)


# -*- coding: utf-8 -*-
""" Generate clean dataset from raw data """
import os
import logging
import re
import string
import pandas as pd
import emoji
from langdetect import detect

logger = logging.getLogger(__name__)

def clean_data(df_raw, text_col='Comment'):
    """ Given a dataframe and a text column, treats the text column of the dataframe and keeps
    only English text.
    """
    df_clean = df_raw.dropna().reset_index(drop=True)
    ids_en = []
    # Clean the text
    for idx, row in df_clean.iterrows():
        text = row[text_col]

        # Remove special characters
        text_clean = text.translate(str.maketrans('', '', '#$%&()*+<=>?@[\\]^_`{|}~'))
        text_clean = text_clean.replace("\"", " ")
        # Remove endlines and tabs
        text_clean = text_clean.replace("\n", " ")
        text_clean = text_clean.replace("\r", " ")
        text_clean = text_clean.replace("\xa0", " ")
        # Reduce multiple spaces to one
        text_clean = re.sub(' +', ' ', text_clean)
        try:
            language = detect(text_clean)
        except:
            language = 'error'
        # If the text is in English, then keep the row
        if language == 'en':
            ids_en.append(idx)

        # Translate emojis to text
        text_clean = emoji.demojize(text_clean, delimiters=(" ", " "))

        df_clean.loc[idx, text_col] = text_clean
    df_clean = df_clean[df_clean.index.isin(ids_en)].reset_index(drop=True)
    return df_clean

def main():
    """ Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    logger.info('making final data set from raw data...')
    # Path of the script
    dirname = os.path.dirname(__file__)
    # Path to the raw data
    data_raw = os.path.join(dirname, "../../data/raw/comments.csv")
    df_raw = pd.read_csv(data_raw, index_col=0, sep=',')
    df_clean = clean_data(df_raw, text_col='Comment')
    logger.info('final data set created')
    # Path to the processed data
    data_processed = os.path.join(dirname, "../../data/processed/comments_clean.csv")
    df_clean.to_csv(data_processed)

if __name__ == '__main__':
    LOG_FMT = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=LOG_FMT)
    ...
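A minimal sketch of exercising clean_data directly, without the ../raw and ../processed folders that main() expects; the sample comments are invented, and langdetect results on very short strings can vary:

# Hypothetical usage of clean_data on an in-memory DataFrame.
import pandas as pd

df_raw = pd.DataFrame({'Comment': [
    "Great video, thanks! 😀",                   # English: kept, emoji demojized to text
    "Ce commentaire est en français, merci !",   # non-English: dropped
]})
df_clean = clean_data(df_raw, text_col='Comment')
print(df_clean['Comment'].tolist())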


Automation Testing Tutorials

Learn to execute automation testing from scratch with the LambdaTest Learning Hub: from setting up the prerequisites and running your first automation test to following best practices and diving deeper into advanced test scenarios. The LambdaTest Learning Hub compiles step-by-step guides to help you become proficient with different test automation frameworks such as Selenium, Cypress, and TestNG.

YouTube

You can also refer to the video tutorials on the LambdaTest YouTube channel for step-by-step demonstrations from industry experts.

Run autotest automation tests on LambdaTest cloud grid

Perform automation testing on 3000+ real desktop and mobile devices online.
