add code documentation

2023-03-17 20:06:01 +01:00 · 2023-03-17 20:06:01 +01:00 · bc842244c7
commit bc842244c7
parent 4479bd2429
7 changed files with 261 additions and 31 deletions
--- a/CRUDManager.py
+++ b/CRUDManager.py
@ -3,7 +3,19 @@ import pandas as pd
 from sqlalchemy import desc, select, sql
 from Tables import Toots

+from pandas.core.api import (
+    DataFrame)
+
 def calculateSentimentCount():
+    """Calculates the frequencies of the sentiments.
+
+    Returns
+    -------
+    DataFrame
+        Containing date (YY-MM-DD), sentiment (positive, neutral, negative),
+        and sentimentCount.
+    """
+
    query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
                FROM Toots
                GROUP BY DATE(datetime),
@ -16,7 +28,18 @@ def calculateSentimentCount():
        parse_dates=["datetime"]
    )

-def calculateSentimentMean(dataframe):
+def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
+    """Calculates the mean of the sentiments.
+
+    Parameters
+    -------
+        dataframe: DataFrame
+
+    Returns
+    -------
+        DataFrame
+        Containing date (YY-MM-DD), sentimentsMean.
+    """
    negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
    positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
    sentimentSum = dataframe['sentimentCount'].sum()
@ -31,7 +54,14 @@ def calculateSentimentMean(dataframe):
        ]
    )

-def getYesterdaysToots():
+def getYesterdaysToots() -> DataFrame:
+    """Fetches yesterday's toots from the database.
+
+    Returns
+    -------
+        pd.DataFrame
+        Containing date (YY-MM-DD), language, sentiment, toot.
+    """
    query = f'''SELECT datetime as date, language, sentiment, toot
                FROM Toots
                WHERE datetime >= DATE("now","-1 day")
@ -43,16 +73,49 @@ def getYesterdaysToots():
    )

 class CRUDManager():
+    """Class for database operations"""

-    def saveToDatabase(self, dataframe, table:str, useIndex=False):
+    def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
+        """Saves dataframe to database.
+
+        Parameters
+        -------
+            dataframe: DataFrame
+                Input dataframe.
+            table: str
+                Table, where to save the data.
+            useIndex: boolean
+                Should the index of the dataframe be used as index for
+                the database table?
+        """
        try:
            dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
        except:
            print(f'Could not save data to {table}!')

-    def loadFromDatabase(self, table:str, indexColumn=None):
+    def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
+        """Load a table into a dataframe.
+
+        Parameters
+        -------
+            table: str
+                Name of the table to load the data from.
+            indexColumn: str | None
+                Column of the table to use as the index of
+                the dataframe (None: no column is used as index).
+        Returns
+        -------
+            DataFrame
+        """
        return pd.read_sql_table(table, connection, index_col=indexColumn)

-    def getLastToot(self):
+    def getLastToot(self) -> str:
+        """Query the last toot id from database.
+
+        Returns
+        -------
+            str
+            A toot id.
+        """
        stmt = select(Toots.tootId).order_by(desc('datetime'))
        return session.scalars(stmt).first()
--- a/DbSetup.py
+++ b/DbSetup.py
@ -1,3 +1,7 @@
+"""Script to initialize the database.
+     Serves database url, engine, connection and session.
+"""
+
 from sqlalchemy import create_engine
 from sqlalchemy.orm import Session
 from sqlalchemy.ext.declarative import declarative_base
@ -9,4 +13,6 @@ session = Session(engine)
 Base = declarative_base()

 def init_db():
+     """Initialize the database.
+     """
     Base.metadata.create_all(bind=engine)
--- a/Main.py
+++ b/Main.py
@ -1,3 +1,19 @@
+"""
+Hedonodon toot sentiment analyzer.
+
+This program fetches toots from the fedihum.org Mastodon instance, calculates
+the frequencies of the sentiments (positive, neutral, negative) and the mean
+of these nominal values (even though this is not statistically correct (;-_-)!,
+but not all analyzers return compound scores).
+It also calculates the word count of the nouns per sentiment.
+
+It uses germansentiment for German toots, twitter-roberta-base-sentiment for
+English toots, and vaderSentiment for other languages.
+
+For the word counts, I first translate the toots to English with the
+GoogleTranslator.
+"""
+
 from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
 from datetime import datetime, date
 from DbSetup import init_db
@ -6,7 +22,7 @@ from MastodonAccountManager import MastodonAccountManager
 import matplotlib.pyplot as plt
 import matplotlib.dates as mdates
 from TootCrawler import TootCrawler
-from SentiTooter import translateToots, createWordCountPerSentiment
+from SentiTooter import translateToots, createWordFrequenciesPerSentiment

 locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
 init_db()
@ -40,7 +56,7 @@ else:
 print('Calculate word counts...')
 yesterdaysToots = getYesterdaysToots()
 translatedToots = translateToots(yesterdaysToots)
-wordCountsPerSentiment = createWordCountPerSentiment(translatedToots)
+wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
 print('done!')

 print(wordCountsPerSentiment);
@ -116,6 +132,7 @@ plotFileUrl = f'./plots/{TodayDate}.png'
 plt.savefig(plotFileUrl)
 print('done!')

-media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
-mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
-
+print('Send toot...')
+#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
+#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
+print('done!')
--- a/MastodonAccountManager.py
+++ b/MastodonAccountManager.py
@ -1,5 +1,7 @@
 from mastodon import Mastodon

 class MastodonAccountManager():
+    """Initialize the Mastodon account.
+    """
    def __init__(self):
        self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
--- a/SentiTooter.py
+++ b/SentiTooter.py
@ -1,4 +1,5 @@
 from germansentiment import SentimentModel
+from pandas import DataFrame
 import numpy as np
 from scipy.special import softmax
 from transformers import AutoModelForSequenceClassification
@ -9,7 +10,18 @@ import spacy
 from collections import Counter

 # Preprocess text (username and link placeholders)
-def preprocess(text):
+def preprocess(text:str) -> str:
+    """Removes tags and urls from text.
+
+    Parameters
+    ------
+        text: str
+        The raw toot from Mastodon.
+    Returns
+    ------
+        str
+        The cleaned text.
+    """
    new_text = []

    for t in text.split(" "):
@ -20,9 +32,12 @@ def preprocess(text):


 class SentiTooter:
-    """"""
+    """Class to analyze the toots.
+    """

    def __init__(self):
+        """Initialize the sentiment models and labels.
+        """
        self.deModel = SentimentModel()
        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
        self.enModel, self.enTokenizer = self.initModel()
@ -30,7 +45,20 @@ class SentiTooter:
        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

-    def analyze(self, language, content):
+    def analyze(self, language:str, content:str) -> list[str, str, float]:
+        """Analyzes the sentiments of the toots.
+
+        Parameters
+        ------
+            language: str
+            The language tag of the toot.
+            content: str
+            The toot content.
+        Returns
+        ------
+            list[str, str, float]
+            A list with the sentiment, analyzer type, and sentiment score.
+        """
        match language:
            case 'de':
                sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -61,6 +89,13 @@ class SentiTooter:


    def initModel(self):
+        """Initialize the English model and tokenizer.
+
+        Returns
+        ------
+            tuple
+                The pretrained model and tokenizer.
+        """
        # PT
        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
        tokenizer.save_pretrained(self.enModelType)
@ -68,7 +103,14 @@ class SentiTooter:
        model.save_pretrained(self.enModelType)
        return model, tokenizer

-def translateToots(yesterdaysToots):
+def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
+    """Translates all toots to English.
+
+    Returns
+    ------
+        DataFrame
+        Containing the toots translated to English.
+    """
    yesterdaysTootsTranslated = yesterdaysToots
    for index, row in yesterdaysTootsTranslated.iterrows():
        if (row['language'] != 'en'):
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
                yesterdaysTootsTranslated.drop(index)
    return yesterdaysTootsTranslated

-def translateToot(language, toot):
+def translateToot(language:str, toot:str) -> str:
+    """Translate a toot into English.
+
+    Parameters
+    ------
+        language:str
+        The language of the toot.
+        toot: str
+        The toot content.
+
+    Returns
+    ------
+        str
+        The toot translated into English.
+    """
    content = preprocess(toot)
    return GoogleTranslator(source=language, target='en').translate(content)

-def countWords(concatedToots, count):
+def countWords(concatedToots: str, number: int) -> list:
+    """Counts the word frequencies in all toots of a given sentiment.
+
+    Parameters
+    ------
+        concatedToots: str
+        All toots from a sentiment.
+        number: int
+        Number of words to calculate word frequencies.
+
+    Returns
+    ------
+        list
+        List containing tuple of word and word frequency.
+    """
    nlp = spacy.load('en_core_web_md')
    doc = nlp(concatedToots)

@ -96,18 +166,30 @@ def countWords(concatedToots, count):

    # five most common noun tokens
    noun_freq = Counter(nouns)
-    return noun_freq.most_common(count)
+    return noun_freq.most_common(number)

-def createWordCountPerSentiment(translatedToots):
+def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
+    """Count all word frequencies of all toots per sentiment.
+
+    Parameters
+    ------
+        translatedToots: DataFrame
+        The dataframe with all toots in English.
+
+    Returns
+    ------
+        str
+        Containing words and word counts per sentiment.
+    """
    sentimentList = []
    for sentiment in ['positive', 'neutral', 'negative']:
        tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
-        wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5)
-        countList = []
-        for count in wordCounts:
-             countList.append(str(count[0]) + ' (' + str(count[1]) + ')')
-        list2String = ', '.join(countList)
+        wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
+        FrequenciesList = []
+        for Frequencies in wordFrequencies:
+             FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
+        list2String = ', '.join(FrequenciesList)
        sentimentString = sentiment + ': ' + list2String
        sentimentList.append(sentimentString)
-    wordCountsPerSentiments = '\n'.join(sentimentList)
-    return wordCountsPerSentiments
+    wordFrequenciessPerSentiments = '\n'.join(sentimentList)
+    return wordFrequenciessPerSentiments
--- a/Tables.py
+++ b/Tables.py
@ -1,3 +1,5 @@
+"""This script contains the table definitions for the database."""
+
 from DbSetup import Base
 from sqlalchemy import Column, Date, Integer, Float, String

@ -14,8 +16,6 @@ class Toots(Base):
    userName = Column(String(255))
    userId = Column(String(255))

-
-
 class SentimentCounts(Base):
    __tablename__ = 'SentimentCounts'
    __table_args__ = {'extend_existing': True}
--- a/TootCrawler.py
+++ b/TootCrawler.py
@ -1,27 +1,87 @@
 from langdetect import detect
 import pytz
 import pandas as pd
+from pandas import DataFrame
 import re
 from SentiTooter import SentiTooter
 from pprint import pprint

 class TootCrawler():
+    """Class to fetch the recent toots from fedihum.org."""

-    def __init__(self, mastodonInstance) -> None:
+    def __init__(self, mastodonInstance: any) -> None:
+        """Initialize the Mastodon instance and depending classes.
+
+        Parameters
+        ------
+            mastodonInstance: any
+                The initialized Mastodon instance.
+        """
        self.mastodonInstance = mastodonInstance
        self.compilePattern = re.compile('<.*?>')
        self.sentiTooter = SentiTooter()
        self.localTimezone = pytz.timezone('Europe/Berlin')

-    def getLocalTimeline(self, minId=None):
+    def getLocalTimeline(self, minId=None) -> any:
+        """Receive the local timeline.
+
+        Parameters
+        ------
+            minId: str | None
+                The last fetched toot id from the database.
+
+        Returns
+        ------
+            any
+                The local Mastodon timeline from fedihum.org.
+        """
        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)

-    def cleanhtml(self, raw_html):
+    def cleanhtml(self, raw_html:str) -> str:
+        """Remove HTML tags and URLs from a toot.
+
+        Parameters
+        ------
+            raw_html: str
+            The toot content.
+        Returns
+        ------
+            str
+            The cleaned toot content.
+        """
        cleantext = re.sub(self.compilePattern, '', raw_html)
        cleantext = re.sub(r'http\S+', '', cleantext)
        return cleantext

-    def buildTootsDataframe(self, minId=None):
+    def buildTootsDataframe(self, minId=None) -> DataFrame:
+        """Parse fetched toots from Mastodon to dataframe.
+
+        Parameters
+        ------
+            minId: str | None
+            The id of the last fetched toot.
+
+        Returns
+        ------
+            DataFrame
+            A Dataframe containing
+            sentiment: str
+                The sentiment (positive, neutral, negative)
+            model: str
+                The used sentiment model.
+            toot: str
+                The content of the toot.
+            datetime: datetime
+                The datetime of the toot.
+            language: str
+                The language flag of the toot.
+            userName: str
+                The user name of the toot.
+            userId: str
+                The user id.
+            tootId: str
+                The toot id.
+        """
        toots = []
        allTimelineResults = []
        timelinePagination = self.getLocalTimeline(minId)