add code documentation

2023-03-17 20:06:01 +01:00 · 2023-03-17 20:06:01 +01:00 · bc842244c7
commit bc842244c7
parent 4479bd2429
7 changed files with 261 additions and 31 deletions
--- a/SentiTooter.py
+++ b/SentiTooter.py
@ -1,4 +1,5 @@
 from germansentiment import SentimentModel
+from pandas import DataFrame
 import numpy as np
 from scipy.special import softmax
 from transformers import AutoModelForSequenceClassification
@ -9,7 +10,18 @@ import spacy
 from collections import Counter

 # Preprocess text (username and link placeholders)
-def preprocess(text):
+def preprocess(text:str) -> str:
+    """Removes tags and urls from text.
+
+    Parameters
+    ------
+        text: str
+        The raw toot from Mastodon.
+    Returns
+    ------
+        str
+        The cleaned text.
+    """
    new_text = []

    for t in text.split(" "):
@ -20,9 +32,12 @@ def preprocess(text):


 class SentiTooter:
-    """"""
+    """Class to analyze the toots.
+    """

    def __init__(self):
+        """Initialize the sentiment models and labels.
+        """
        self.deModel = SentimentModel()
        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
        self.enModel, self.enTokenizer = self.initModel()
@ -30,7 +45,20 @@ class SentiTooter:
        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

-    def analyze(self, language, content):
+    def analyze(self, language:str, content:str) -> list[str, str, float]:
+        """Analyzes the sentiments of the toots.
+
+        Parameters
+        ------
+            language: str
+            The language tag of the toot.
+            content: str
+            The toot content.
+        Returns
+        ------
+            list[str, str, float]
+            A list with the sentiment, analyzer type, and sentiment score.
+        """
        match language:
            case 'de':
                sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -61,6 +89,13 @@ class SentiTooter:


    def initModel(self):
+        """Initialize the English models.
+
+        Returns
+        ------
+            tuple
+                The pretrained model and tokenizer.
+        """
        # PT
        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
        tokenizer.save_pretrained(self.enModelType)
@ -68,7 +103,14 @@ class SentiTooter:
        model.save_pretrained(self.enModelType)
        return model, tokenizer

-def translateToots(yesterdaysToots):
+def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
+    """Translates all toots to English.
+
+    Returns
+    ------
+        DataFrame
+        Containing the toots translated into English.
+    """
    yesterdaysTootsTranslated = yesterdaysToots
    for index, row in yesterdaysTootsTranslated.iterrows():
        if (row['language'] != 'en'):
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
                yesterdaysTootsTranslated.drop(index)
    return yesterdaysTootsTranslated

-def translateToot(language, toot):
+def translateToot(language:str, toot:str) -> str:
+    """Translate a toot into English.
+
+    Parameters
+    ------
+        language:str
+        The language of the toot.
+        toot: str
+        The toot content.
+
+    Returns
+    ------
+        str
+        The toot translated into English.
+    """
    content = preprocess(toot)
    return GoogleTranslator(source=language, target='en').translate(content)

-def countWords(concatedToots, count):
+def countWords(concatedToots: str, number: int) -> list:
+    """Counts the word frequencies in all toots of a given sentiment.
+
+    Parameters
+    ------
+        concatedToots: str
+        All toots from a sentiment.
+        number: int
+        Number of most frequent nouns to return.
+
+    Returns
+    ------
+        list
+        List of (word, frequency) tuples.
+    """
    nlp = spacy.load('en_core_web_md')
    doc = nlp(concatedToots)

@ -96,18 +166,30 @@ def countWords(concatedToots, count):

    # five most common noun tokens
    noun_freq = Counter(nouns)
-    return noun_freq.most_common(count)
+    return noun_freq.most_common(number)

-def createWordCountPerSentiment(translatedToots):
+def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
+    """Count all word frequencies of all toots per sentiment.
+
+    Parameters
+    ------
+        translatedToots: DataFrame
+        The DataFrame with all toots in English.
+
+    Returns
+    ------
+        str
+        Containing words and word counts per sentiment.
+    """
    sentimentList = []
    for sentiment in ['positive', 'neutral', 'negative']:
        tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
-        wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5)
-        countList = []
-        for count in wordCounts:
-             countList.append(str(count[0]) + ' (' + str(count[1]) + ')')
-        list2String = ', '.join(countList)
+        wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
+        FrequenciesList = []
+        for Frequencies in wordFrequencies:
+             FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
+        list2String = ', '.join(FrequenciesList)
        sentimentString = sentiment + ': ' + list2String
        sentimentList.append(sentimentString)
-    wordCountsPerSentiments = '\n'.join(sentimentList)
-    return wordCountsPerSentiments
+    wordFrequenciessPerSentiments = '\n'.join(sentimentList)
+    return wordFrequenciessPerSentiments