195 lines
No EOL
6.4 KiB
Python
195 lines
No EOL
6.4 KiB
Python
from germansentiment import SentimentModel
|
|
from pandas import DataFrame
|
|
import numpy as np
|
|
from scipy.special import softmax
|
|
from transformers import AutoModelForSequenceClassification
|
|
from transformers import AutoTokenizer
|
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
|
from deep_translator import GoogleTranslator
|
|
import spacy
|
|
from collections import Counter
|
|
|
|
# Preprocess text (strip username mentions and links)
|
|
def preprocess(text:str) -> str:
    """Removes user mentions and urls from text.

    Parameters
    ------
    text: str
        The raw toot from Mastodon.

    Returns
    ------
    str
        The cleaned text. Removed tokens leave their surrounding whitespace
        in place, so the result may contain consecutive whitespace.
    """
    import re  # local import: this module does not import ``re`` at the top

    def is_noise(token: str) -> bool:
        # A lone "@" is kept; only "@name"-style mentions are dropped.
        if token.startswith('@') and len(token) > 1:
            return True
        return token.startswith('http')

    # Split on every whitespace run (not only the space character) and keep
    # the separators: mentions/links that follow a newline or tab are caught,
    # and the original spacing is reproduced unchanged.
    pieces = re.split(r'(\s+)', text)
    return ''.join('' if is_noise(piece) else piece for piece in pieces)
|
|
|
|
|
|
class SentiTooter:
|
|
"""Class to analyze the toots.
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initilize the sentiment models and labels.
|
|
"""
|
|
self.deModel = SentimentModel()
|
|
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
|
|
self.enModel, self.enTokenizer = self.initModel()
|
|
# https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
|
|
self.labels = ['negative', 'neutral', 'positive']
|
|
self.sia = SentimentIntensityAnalyzer()
|
|
|
|
def analyze(self, language:str, content:str) -> list[str, str, float]:
|
|
"""Analyzes the sentiments of the toots.
|
|
|
|
Parameters
|
|
------
|
|
language: str
|
|
The language tag of the toot.
|
|
content: str
|
|
The toot content.
|
|
Returns
|
|
------
|
|
list[str, str, float]
|
|
A list with the sentiment, analyzer type, and sentiment score.
|
|
"""
|
|
match language:
|
|
case 'de':
|
|
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
|
|
sentiment = sentimentList[0]
|
|
score = {i[0]: i[1] for i in probabilitiesList[0]}[sentiment]
|
|
return [sentiment, 'germanSentiment', score]
|
|
case 'en':
|
|
text = preprocess(content)
|
|
encoded_input = self.enTokenizer(text, return_tensors='pt')
|
|
output = self.enModel(**encoded_input)
|
|
scores = output[0][0].detach().numpy()
|
|
scores = softmax(scores)
|
|
#print(scores)
|
|
sentimentIndexWithMaxScore = np.argmax(scores)
|
|
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
|
|
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
|
|
#print(sentiment)
|
|
return sentiment
|
|
case _:
|
|
compound = self.sia.polarity_scores(content)['compound']
|
|
#print(self.sia.polarity_scores(content), 'vaderSentiment')
|
|
if compound > (1 / 3):
|
|
return ['positive', 'vaderSentiment']
|
|
elif compound < (-1 / 3):
|
|
return ['negative', 'vaderSentiment']
|
|
else:
|
|
return ['neutral', 'vaderSentiment']
|
|
|
|
|
|
def initModel(self):
|
|
"""Initialize the english models.
|
|
|
|
Returns
|
|
------
|
|
tupel
|
|
The pretrained model and tokenizer.
|
|
"""
|
|
# PT
|
|
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
|
|
tokenizer.save_pretrained(self.enModelType)
|
|
model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
|
|
model.save_pretrained(self.enModelType)
|
|
return model, tokenizer
|
|
|
|
def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
    """Translates all toots to english.

    The passed dataframe is modified in place and also returned. Rows whose
    translation fails are removed from it.

    Parameters
    ------
    yesterdaysToots: DataFrame
        The toots; the columns 'language' and 'toot' are read and written.

    Returns
    ------
    Dataframe
        Containing the english translated toots.
    """
    yesterdaysTootsTranslated = yesterdaysToots
    failedIndices = []
    for index, row in yesterdaysTootsTranslated.iterrows():
        if row['language'] == 'en':
            continue
        try:
            translated = translateToot(row['language'], row['toot'])
        except Exception:
            # Remember the row and drop it after the loop; dropping while
            # iterating would change the frame that is being iterated.
            failedIndices.append(index)
            continue
        yesterdaysTootsTranslated.at[index, 'toot'] = translated
        yesterdaysTootsTranslated.at[index, 'language'] = 'en'
    if failedIndices:
        # drop() returns a new frame unless inplace=True is given, so the
        # result must be applied to the frame itself to have any effect.
        yesterdaysTootsTranslated.drop(index=failedIndices, inplace=True)
    return yesterdaysTootsTranslated
|
|
|
|
def translateToot(language:str, toot:str) -> str:
    """Translate a single toot to english.

    Parameters
    ------
    language: str
        The language of the toot, used as the translator's source language.
    toot: str
        The toot content.

    Returns
    ------
    str
        The toot translated to english.
    """
    # Clean the toot with preprocess() before it is sent to the translator.
    cleaned = preprocess(toot)
    translator = GoogleTranslator(source=language, target='en')
    return translator.translate(cleaned)
|
|
|
|
def countWords(concatedToots: str, number: int) -> list:
    """Counts the noun frequencies in the concatenated toots of a sentiment.

    Parameters
    ------
    concatedToots: str
        All toots from a sentiment, joined into one string.
    number: int
        How many of the most frequent nouns to return.

    Returns
    ------
    list
        List containing tuples of word and word frequency.
    """
    pipeline = spacy.load('en_core_web_md')
    parsed = pipeline(concatedToots)

    def is_counted(token) -> bool:
        # Only nouns that are neither stop words nor punctuation are counted.
        return not (token.is_stop or token.is_punct) and token.pos_ == "NOUN"

    frequencies = Counter(token.text for token in parsed if is_counted(token))

    # The `number` most common noun tokens.
    return frequencies.most_common(number)
|
|
|
|
def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
    """Count the word frequencies of all toots per sentiment.

    Parameters
    ------
    translatedToots: DataFrame
        The dataframe with all toots in english; the columns 'sentiment'
        and 'toot' are read.

    Returns
    ------
    str
        One line per sentiment in the form
        "<sentiment>: <word> (<count>), <word> (<count>), ...".
    """
    lines = []
    for sentiment in ('positive', 'neutral', 'negative'):
        matchingToots = translatedToots[translatedToots['sentiment'] == sentiment].toot
        # All toots of this sentiment become one text for counting.
        combinedText = matchingToots.str.cat(sep=' ')
        topWords = countWords(combinedText, 5)
        formatted = ', '.join(f'{entry[0]} ({entry[1]})' for entry in topWords)
        lines.append(f'{sentiment}: {formatted}')
    return '\n'.join(lines)