def clean_text(text):
    '''
    Clean a raw sentence for modelling.

    This function will:
    1. remove punctuation characters
    2. remove common (stop) words
    3. return the cleaned text as a space-separated string

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The sentence with punctuation stripped and stop words removed.

    NOTE(review): this function is defined twice in this file (duplicated
    pasted block) — consider de-duplicating.
    '''
    # Stop words kept in a set for O(1) membership tests.
    stop_words = {
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down",
        "in", "out", "on", "off", "over", "under", "again", "further",
        "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "s", "t", "can", "will", "just", "don",
        "should", "now",
    }
    # Strip punctuation in a single C-level pass.
    no_punc = text.translate(str.maketrans("", "", string.punctuation))
    # BUG FIX: the original iterated over the *characters* of the cleaned
    # string, so the stop-word filter only dropped single characters
    # ("i", "a", "s", "t", ...) instead of whole words, and joined the
    # remainder with "". Split into words before filtering and rejoin
    # with spaces so downstream tokenization (TfidfVectorizer) works.
    return " ".join(word for word in no_punc.split()
                    if word.lower() not in stop_words)
# Clean every raw sentence into a model-ready text column.
df["cleaned"] = df["Sentence"].apply(clean_text)

# Encode sentiment labels as integers:
# positive -> 1, negative -> -1, anything else -> 0.
sentiment = df["Sentiment"]
df["mapped_sentiments"] = np.where(
    sentiment == "positive",
    1,
    np.where(sentiment == "negative", -1, 0),
)

# Show the cleaned/encoded data in the app.
st.header('Clean data')
st.dataframe(df.head())
# Standard library
import string

# Third-party
import numpy as np
import pandas as pd
import streamlit as st  # `st` is called throughout this file but was never imported
from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# NOTE(review): `pq` (the warehouse client used below) is also never
# imported — confirm which package provides it and add the import.
# Pull the support-ticket table from the warehouse and drop the stale
# 'Prediction' column before re-training.
dbconn = pq.dbconnect('dw_123')
df = dbconn.fetch('db_name', 'schema_name', 'support_tickets', df=True).drop('Prediction', axis=1)

# Page title and a preview of the raw data (st = Streamlit module).
st.title("Sentiment Analysis")
st.text("Sample data")
st.dataframe(df.head(), use_container_width=True)
def clean_text(text):
    '''
    Clean a raw sentence for modelling.

    This function will:
    1. remove punctuation characters
    2. remove common (stop) words
    3. return the cleaned text as a space-separated string

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The sentence with punctuation stripped and stop words removed.

    NOTE(review): this function is defined twice in this file (duplicated
    pasted block) — consider de-duplicating.
    '''
    # Stop words kept in a set for O(1) membership tests.
    stop_words = {
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down",
        "in", "out", "on", "off", "over", "under", "again", "further",
        "then", "once", "here", "there", "when", "where", "why", "how",
        "all", "any", "both", "each", "few", "more", "most", "other",
        "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "s", "t", "can", "will", "just", "don",
        "should", "now",
    }
    # Strip punctuation in a single C-level pass.
    no_punc = text.translate(str.maketrans("", "", string.punctuation))
    # BUG FIX: the original iterated over the *characters* of the cleaned
    # string, so the stop-word filter only dropped single characters
    # ("i", "a", "s", "t", ...) instead of whole words, and joined the
    # remainder with "". Split into words before filtering and rejoin
    # with spaces so downstream tokenization (TfidfVectorizer) works.
    return " ".join(word for word in no_punc.split()
                    if word.lower() not in stop_words)
# Clean every raw sentence into a model-ready text column.
df["cleaned"] = df["Sentence"].apply(clean_text)

# Encode sentiment labels as integers:
# positive -> 1, negative -> -1, anything else -> 0.
sentiment = df["Sentiment"]
df["mapped_sentiments"] = np.where(
    sentiment == "positive",
    1,
    np.where(sentiment == "negative", -1, 0),
)

# Show the cleaned/encoded data in the app.
st.header('Clean data')
st.dataframe(df.head())
# Features (cleaned text) and integer-encoded labels.
texts = df["cleaned"]
targets = df["mapped_sentiments"]

# Hold out 25% of the rows for evaluation; fixed seed for reproducibility.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, targets, test_size=0.25, random_state=42
)

# TF-IDF is fitted on the training portion only, then both splits are
# transformed into sparse matrices. `vec` is reused below when saving.
vec = TfidfVectorizer()
train_matrix = vec.fit_transform(texts_train)
test_matrix = vec.transform(texts_test)

# Linear-kernel SVM. probability=True kept from the original (it enables
# predict_proba at extra training cost; not used in this file).
model = SVC(kernel='linear', C=0.6, probability=True)
model.fit(train_matrix, y_train)
pred = model.predict(test_matrix)

# Report held-out accuracy in the app.
st.header('Evaluation')
accuracy = accuracy_score(y_test, pred)
st.text("Accuracy: " + str(accuracy))
# Saving the model & vectorizer
# Persist both fitted artifacts so an inference app can load the exact
# vocabulary/idf weights the model was trained against.
# NOTE(review): paths are hard-coded — confirm /data_app exists and is
# writable wherever this script runs.
dump(model, '/data_app/model_financial_sentiment')
dump(vec, '/data_app/vectorizer_financial_sentiment')
st.success('Model saved successfully!')