hedonodon/SentiTooter.py
2023-03-15 14:27:07 +01:00

99 lines
No EOL
3.8 KiB
Python

from germansentiment import SentimentModel
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import spacy
from collections import Counter
def preprocess(text):
    """Blank out user mentions and links in ``text``.

    The text is split on single spaces. Every token that starts with ``@``
    (and is longer than one character) or starts with ``http`` is replaced by
    an empty string; all other tokens are kept unchanged. The tokens are then
    re-joined with single spaces, so a removed token leaves its surrounding
    spaces behind.
    """
    def _blankIfNoise(token):
        isMention = len(token) > 1 and token.startswith('@')
        isLink = token.startswith('http')
        return '' if isMention or isLink else token

    return " ".join(map(_blankIfNoise, text.split(" ")))
class SentiTooter:
    """Classify the sentiment of a toot with a language-specific backend.

    German text is scored by ``germansentiment``'s ``SentimentModel``, English
    text by the ``cardiffnlp/twitter-roberta-base-sentiment`` transformer, and
    every other language falls back to VADER's ``SentimentIntensityAnalyzer``.
    """

    # Upper bound on the number of tokens handed to the English model.
    # NOTE(review): 512 is the usual RoBERTa-base input limit (special tokens
    # included); confirm against the checkpoint's config if the model changes.
    enMaxTokens = 512

    def __init__(self):
        """Load all three sentiment backends."""
        self.deModel = SentimentModel()
        self.enModelType = "cardiffnlp/twitter-roberta-base-sentiment"
        self.enModel, self.enTokenizer = self.initModel()
        # Index -> label mapping of the English model, see
        # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

    def analyze(self, language, content):
        """Return the sentiment of ``content`` as a list.

        Args:
            language: Language code of the text. ``'de'`` and ``'en'`` select
                the dedicated models; any other value uses VADER.
            content: The text to classify.

        Returns:
            ``[label, backendName, score]`` for ``'de'`` and ``'en'``, where
            ``score`` is the probability assigned to the chosen label, and
            ``[label, 'vaderSentiment']`` (no score) for every other language.
            ``label`` is ``'positive'``, ``'neutral'`` or ``'negative'`` for
            the English and VADER backends; the German label is whatever
            ``germansentiment`` reports.
        """
        if language == 'de':
            return self._analyzeGerman(content)
        if language == 'en':
            return self._analyzeEnglish(content)
        return self._analyzeFallback(content)

    def _analyzeGerman(self, content):
        """Score German text with ``germansentiment``."""
        sentimentList, probabilitiesList = self.deModel.predict_sentiment(
            [content], output_probabilities=True)
        sentiment = sentimentList[0]
        # probabilitiesList[0] holds (label, probability) pairs for the single
        # input text; pick the probability of the predicted label.
        score = dict(probabilitiesList[0])[sentiment]
        return [sentiment, 'germanSentiment', score]

    def _analyzeEnglish(self, content):
        """Score English text with the twitter-roberta model."""
        text = preprocess(content)
        # Truncate explicitly: without it an over-long toot is passed to the
        # model in full and can exceed its maximum sequence length.
        encodedInput = self.enTokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=self.enMaxTokens,
        )
        output = self.enModel(**encodedInput)
        scores = softmax(output[0][0].detach().numpy())
        best = int(np.argmax(scores))
        # Convert the numpy scalar to a plain float so callers do not receive
        # a numpy-specific type from this branch.
        return [self.labels[best], 'twitter-roberta-base-sentiment', float(scores[best])]

    def _analyzeFallback(self, content):
        """Score text of any other language with VADER's compound value.

        The compound value is bucketed at +/- one third; no score is returned
        (the result has two elements, unlike the other backends).
        """
        compound = self.sia.polarity_scores(content)['compound']
        if compound > (1 / 3):
            label = 'positive'
        elif compound < (-1 / 3):
            label = 'negative'
        else:
            label = 'neutral'
        return [label, 'vaderSentiment']

    def initModel(self):
        """Load the English tokenizer and model (PyTorch weights).

        Both are also written to a local directory whose path equals the
        model id.
        NOTE(review): presumably this lets later runs load from disk instead
        of downloading again - confirm.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
        tokenizer.save_pretrained(self.enModelType)
        model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
        model.save_pretrained(self.enModelType)
        return model, tokenizer
def translateToots(yesterdaysToots, translate=None):
    """Translate every non-German toot of a DataFrame to German, in place.

    Rows whose ``language`` is not ``'de'`` get their ``toot`` replaced by the
    translation and their ``language`` set to ``'de'``. Rows whose translation
    raises are removed from the frame (best effort: one failing toot must not
    abort the whole batch).

    Args:
        yesterdaysToots: DataFrame with at least the columns ``language`` and
            ``toot``. It is modified in place.
        translate: Optional callable ``(language, toot) -> translatedText``.
            Defaults to ``translateToot``.

    Returns:
        The same DataFrame object that was passed in.
    """
    if translate is None:
        translate = translateToot

    failedRows = []
    for index, row in yesterdaysToots.iterrows():
        if row['language'] == 'de':
            continue
        try:
            translated = translate(row['language'], row['toot'])
        except Exception:
            # Remember the row and remove it after the loop; dropping while
            # iterating would change the frame under the iterator.
            failedRows.append(index)
            continue
        yesterdaysToots.at[index, 'toot'] = translated
        yesterdaysToots.at[index, 'language'] = 'de'

    if failedRows:
        # DataFrame.drop returns a new frame unless inplace=True is given, so
        # the drop must be in place to take effect on the returned object.
        yesterdaysToots.drop(index=failedRows, inplace=True)
    return yesterdaysToots
def translateToot(language, toot):
    """Translate a single toot from ``language`` to German.

    The toot is first run through ``preprocess`` (which blanks mentions and
    links) and the cleaned text is then sent to ``GoogleTranslator``.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for the cleaned text.
    """
    cleaned = preprocess(toot)
    translator = GoogleTranslator(source=language, target='de')
    return translator.translate(cleaned)
def countWords(concatedToots, count):
    """Return the ``count`` most frequent nouns in ``concatedToots``.

    The text is parsed with spaCy's ``de_core_news_sm`` pipeline. Only tokens
    tagged as ``NOUN`` that are neither stop words nor punctuation are
    counted, using their surface text.

    Returns:
        A list of ``(noun, frequency)`` tuples as produced by
        ``Counter.most_common(count)``.
    """
    pipeline = spacy.load('de_core_news_sm')
    tally = Counter()
    for token in pipeline(concatedToots):
        # Skip everything that is not a content-bearing noun.
        if token.is_stop or token.is_punct or token.pos_ != "NOUN":
            continue
        tally[token.text] += 1
    return tally.most_common(count)