hedonodon/SentiTooter.py

from germansentiment import SentimentModel
from pandas import DataFrame
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import spacy
from collections import Counter

# Preprocess text (username and link placeholders)
def preprocess(text:str) -> str:
    """Removes tags and urls from text.

    Parameters
    ------
        text: str
        The raw toot from Mastodon.
    Returns
    ------
        str
        The cleaned text.
    """
    new_text = []

    for t in text.split(" "):
        t = '' if t.startswith('@') and len(t) > 1 else t
        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


class SentiTooter:
    """Class to analyze the toots.
    """

    def __init__(self):
        """Initilize the sentiment models and labels.
        """
        self.deModel = SentimentModel()
        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
        self.enModel, self.enTokenizer = self.initModel()
        # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

    def analyze(self, language:str, content:str) -> list[str, str, float]:
        """Analyzes the sentiments of the toots.

        Parameters
        ------
            language: str
            The language tag of the toot.
            content: str
            The toot content.
        Returns
        ------
            list[str, str, float]
            A list with the sentiment, analyzer type, and sentiment score.
        """
        match language:
            case 'de':
                sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
                sentiment = sentimentList[0]
                score = {i[0]: i[1] for i in probabilitiesList[0]}[sentiment]
                return [sentiment, 'germanSentiment', score]
            case 'en':
                text = preprocess(content)
                encoded_input = self.enTokenizer(text, return_tensors='pt')
                output = self.enModel(**encoded_input)
                scores = output[0][0].detach().numpy()
                scores = softmax(scores)
                #print(scores)
                sentimentIndexWithMaxScore = np.argmax(scores)
                sentimentLabel = self.labels[sentimentIndexWithMaxScore]
                sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
                #print(sentiment)
                return sentiment
            case _:
                compound = self.sia.polarity_scores(content)['compound']
                #print(self.sia.polarity_scores(content), 'vaderSentiment')
                if compound > (1 / 3):
                    return ['positive', 'vaderSentiment']
                elif compound < (-1 / 3):
                    return ['negative', 'vaderSentiment']
                else:
                    return ['neutral', 'vaderSentiment']


    def initModel(self):
        """Initialize the english models.

        Returns
        ------
            tupel
                The pretrained model and tokenizer.
        """
        # PT
        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
        tokenizer.save_pretrained(self.enModelType)
        model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
        model.save_pretrained(self.enModelType)
        return model, tokenizer

def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
    """Translates all toots to english.

    Returns
    ------
        Dataframe
        Containing the english translated toots.
    """
    yesterdaysTootsTranslated = yesterdaysToots
    for index, row in yesterdaysTootsTranslated.iterrows():
        if (row['language'] != 'en'):
            try:
                yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot'])
                yesterdaysTootsTranslated.at[index,'language'] = 'en'
            except:
                yesterdaysTootsTranslated.drop(index)
    return yesterdaysTootsTranslated

def translateToot(language:str, toot:str) -> str:
    """Translate a toot in english.

    Parameters
    ------
        language:str
        The language of the toot.
        toot: str
        The toot content.

    Returns
    ------
        str
        The in english translated toot.
    """
    content = preprocess(toot)
    return GoogleTranslator(source=language, target='en').translate(content)

def countWords(concatedToots: str, number: int) -> list:
    """Counts the word frequencies in all toots of a given sentiment.

    Parameters
    ------
        concatedToots: str
        All toots from a sentiment.
        number: int
        Number of words to calculate word frequencies.

    Returns
    ------
        list
        List containing tuple of word and word frequency.
    """
    nlp = spacy.load('en_core_web_md')
    doc = nlp(concatedToots)

    # noun tokens that arent stop words or punctuations
    nouns = [token.text
            for token in doc
            if (not token.is_stop and
                not token.is_punct and
                token.pos_ == "NOUN")]

    # five most common noun tokens
    noun_freq = Counter(nouns)
    return noun_freq.most_common(number)

def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
    """Count all word frequencies of all toots per sentiment.

    Paramters
    ------
        translatedToots: DataFrame
        The dataframe with all toots in english.

    Returns
    ------
        str
        Containing words and wourd counts per sentiment.
    """
    sentimentList = []
    for sentiment in ['positive', 'neutral', 'negative']:
        tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
        wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
        FrequenciesList = []
        for Frequencies in wordFrequencies:
             FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
        list2String = ', '.join(FrequenciesList)
        sentimentString = sentiment + ': ' + list2String
        sentimentList.append(sentimentString)
    wordFrequenciessPerSentiments = '\n'.join(sentimentList)
    return wordFrequenciessPerSentiments