add code documentation

2023-03-17 20:06:01 +01:00 · 2023-03-17 20:06:01 +01:00 · bc842244c7
commit bc842244c7
parent 4479bd2429
7 changed files with 261 additions and 31 deletions
--- a/CRUDManager.py
+++ b/CRUDManager.py
@ -3,7 +3,19 @@ import pandas as pd
 from sqlalchemy import desc, select, sql
 from Tables import Toots
 from pandas.core.api import (
    DataFrame)
 def calculateSentimentCount():
    """Calculates the frequencies of the sentiments.
    Returns
    -------
    DataFrame
        Containing date (YYYY-MM-DD), sentiment (positive, neutral, negative),
        and sentimentCount.
    """
    query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
                FROM Toots
                GROUP BY DATE(datetime),
@ -16,7 +28,18 @@ def calculateSentimentCount():
        parse_dates=["datetime"]
    )
-def calculateSentimentMean(dataframe):
+def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
    """Calculates the mean of the sentiments.
    Parameters
    -------
        dataframe: DataFrame
    Returns
    -------
        Dataframe
        Containing date (YY-MM-DD), sentimentsMean.
    """
    negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
    positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
    sentimentSum = dataframe['sentimentCount'].sum()
@ -31,7 +54,14 @@ def calculateSentimentMean(dataframe):
        ]
    )
-def getYesterdaysToots():
+def getYesterdaysToots() -> DataFrame:
    """Fetches yesterdays toots from database.
    Returns
    -------
        pd.Dataframe
        Containing date (YY-MM-DD), language, sentiment, toot.
    """
    query = f'''SELECT datetime as date, language, sentiment, toot
                FROM Toots
                WHERE datetime >= DATE("now","-1 day")
@ -43,16 +73,49 @@ def getYesterdaysToots():
    )
 class CRUDManager():
    """Class for database operations"""
-    def saveToDatabase(self, dataframe, table:str, useIndex=False):
+    def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
        """Saves dataframe to database.
        Parameters
        -------
            dataframe: DataFrame
                Input dataframe.
            table: str
                Table, where to save the data.
            useIndex: boolean
                Should the index of the dataframe be used as index for
                the database table?
        """
        try:
            dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
        except:
            print(f'Could not save data to {table}!')
-    def loadFromDatabase(self, table:str, indexColumn=None):
+    def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
        """Load a table into a dataframe.
        Parameters
        -------
            table: str
                Table to load the data from.
            indexColumn: str | None
                Name of the table column to use as index of
                the dataframe (optional).
        Returns
        -------
            DataFrame
        """
        return pd.read_sql_table(table, connection, index_col=indexColumn)
-    def getLastToot(self):
+    def getLastToot(self) -> str:
        """Query the last toot id from database.
        Returns
        -------
            str
            A toot id.
        """
        stmt = select(Toots.tootId).order_by(desc('datetime'))
        return session.scalars(stmt).first()
--- a/DbSetup.py
+++ b/DbSetup.py
@ -1,3 +1,7 @@
 """Script to initialize the database.
     Serves database url, engine, connection and session.
 """
 from sqlalchemy import create_engine
 from sqlalchemy.orm import Session
 from sqlalchemy.ext.declarative import declarative_base
@ -9,4 +13,6 @@ session = Session(engine)
 Base = declarative_base()
 def init_db():
     """Initialize the database.
     """
     Base.metadata.create_all(bind=engine)
--- a/Main.py
+++ b/Main.py
@ -1,3 +1,19 @@
 """
 Hedonodon toot sentiment analyzer.
 This program fetches toots from the fedihum.org Mastodon instance, calculates
 the frequencies of the sentiments (positive, neutral, negative) and the mean
 of these nominal values (this is not statistically correct (;-_-)!, but
 not all analyzers return compound scores).
 It also calculates the word count of the nouns per sentiment.
 It uses germansentiment for German toots, twitter-roberta-base-sentiment for
 English toots, and vaderSentiment for other languages.
 For the word counts, the toots are first translated to English with the
 GoogleTranslator.
 """
 from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
 from datetime import datetime, date
 from DbSetup import init_db
@ -6,7 +22,7 @@ from MastodonAccountManager import MastodonAccountManager
 import matplotlib.pyplot as plt
 import matplotlib.dates as mdates
 from TootCrawler import TootCrawler
-from SentiTooter import translateToots, createWordCountPerSentiment
+from SentiTooter import translateToots, createWordFrequenciesPerSentiment
 locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
 init_db()
@ -40,7 +56,7 @@ else:
 print('Calculate word counts...')
 yesterdaysToots = getYesterdaysToots()
 translatedToots = translateToots(yesterdaysToots)
-wordCountsPerSentiment = createWordCountPerSentiment(translatedToots)
+wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
 print('done!')
 print(wordCountsPerSentiment);
@ -116,6 +132,7 @@ plotFileUrl = f'./plots/{TodayDate}.png'
 plt.savefig(plotFileUrl)
 print('done!')
-media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
+print('Send toot...')
-mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
+#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
-
+#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
 print('done!')
--- a/MastodonAccountManager.py
+++ b/MastodonAccountManager.py
@ -1,5 +1,7 @@
 from mastodon import Mastodon
 class MastodonAccountManager():
    """Initialize the Mastodon account.
    """
    def __init__(self):
        self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
--- a/SentiTooter.py
+++ b/SentiTooter.py
@ -1,4 +1,5 @@
 from germansentiment import SentimentModel
 from pandas import DataFrame
 import numpy as np
 from scipy.special import softmax
 from transformers import AutoModelForSequenceClassification
@ -9,7 +10,18 @@ import spacy
 from collections import Counter
 # Preprocess text (username and link placeholders)
-def preprocess(text):
+def preprocess(text:str) -> str:
    """Removes tags and urls from text.
    Parameters
    ------
        text: str
        The raw toot from Mastodon.
    Returns
    ------
        str
        The cleaned text.
    """
    new_text = []
    for t in text.split(" "):
@ -20,9 +32,12 @@ def preprocess(text):
 class SentiTooter:
-    """"""
+    """Class to analyze the toots.
    """
    def __init__(self):
        """Initialize the sentiment models and labels.
        """
        self.deModel = SentimentModel()
        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
        self.enModel, self.enTokenizer = self.initModel()
@ -30,7 +45,20 @@ class SentiTooter:
        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()
-    def analyze(self, language, content):
+    def analyze(self, language:str, content:str) -> list[str, str, float]:
        """Analyzes the sentiments of the toots.
        Parameters
        ------
            language: str
            The language tag of the toot.
            content: str
            The toot content.
        Returns
        ------
            list[str, str, float]
            A list with the sentiment, analyzer type, and sentiment score.
        """
        match language:
            case 'de':
                sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -61,6 +89,13 @@ class SentiTooter:
    def initModel(self):
        """Initialize the english models.
        Returns
        ------
            tuple
                The pretrained model and tokenizer.
        """
        # PT
        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
        tokenizer.save_pretrained(self.enModelType)
@ -68,7 +103,14 @@ class SentiTooter:
        model.save_pretrained(self.enModelType)
        return model, tokenizer
-def translateToots(yesterdaysToots):
+def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
    """Translates all toots to english.
    Returns
    ------
        Dataframe
        Containing the english translated toots.
    """
    yesterdaysTootsTranslated = yesterdaysToots
    for index, row in yesterdaysTootsTranslated.iterrows():
        if (row['language'] != 'en'):
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
                yesterdaysTootsTranslated.drop(index)
    return yesterdaysTootsTranslated
-def translateToot(language, toot):
+def translateToot(language:str, toot:str) -> str:
    """Translate a toot into English.
    Parameters
    ------
        language:str
        The language of the toot.
        toot: str
        The toot content.
    Returns
    ------
        str
        The toot translated into English.
    """
    content = preprocess(toot)
    return GoogleTranslator(source=language, target='en').translate(content)
-def countWords(concatedToots, count):
+def countWords(concatedToots: str, number: int) -> list:
    """Counts the word frequencies in all toots of a given sentiment.
    Parameters
    ------
        concatedToots: str
        All toots from a sentiment.
        number: int
        Number of words to calculate word frequencies.
    Returns
    ------
        list
        List containing tuple of word and word frequency.
    """
    nlp = spacy.load('en_core_web_md')
    doc = nlp(concatedToots)
@ -96,18 +166,30 @@ def countWords(concatedToots, count):
    # five most common noun tokens
    noun_freq = Counter(nouns)
-    return noun_freq.most_common(count)
+    return noun_freq.most_common(number)
-def createWordCountPerSentiment(translatedToots):
+def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
    """Count all word frequencies of all toots per sentiment.
    Parameters
    ------
        translatedToots: DataFrame
        The dataframe with all toots in english.
    Returns
    ------
        str
        Containing words and word counts per sentiment.
    """
    sentimentList = []
    for sentiment in ['positive', 'neutral', 'negative']:
        tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
-        wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5)
+        wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
-        countList = []
+        FrequenciesList = []
-        for count in wordCounts:
+        for Frequencies in wordFrequencies:
-             countList.append(str(count[0]) + ' (' + str(count[1]) + ')')
+             FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
-        list2String = ', '.join(countList)
+        list2String = ', '.join(FrequenciesList)
        sentimentString = sentiment + ': ' + list2String
        sentimentList.append(sentimentString)
-    wordCountsPerSentiments = '\n'.join(sentimentList)
+    wordFrequenciessPerSentiments = '\n'.join(sentimentList)
-    return wordCountsPerSentiments
+    return wordFrequenciessPerSentiments
--- a/Tables.py
+++ b/Tables.py
@ -1,3 +1,5 @@
 """This script contains the table definitions for the database."""
 from DbSetup import Base
 from sqlalchemy import Column, Date, Integer, Float, String
@ -14,8 +16,6 @@ class Toots(Base):
    userName = Column(String(255))
    userId = Column(String(255))
 class SentimentCounts(Base):
    __tablename__ = 'SentimentCounts'
    __table_args__ = {'extend_existing': True}
--- a/TootCrawler.py
+++ b/TootCrawler.py
@ -1,27 +1,87 @@
 from langdetect import detect
 import pytz
 import pandas as pd
 from pandas import DataFrame
 import re
 from SentiTooter import SentiTooter
 from pprint import pprint
 class TootCrawler():
    """Class to fetch the recent toots from fedihum.org."""
-    def __init__(self, mastodonInstance) -> None:
+    def __init__(self, mastodonInstance: any) -> None:
        """Initialize the Mastodon instance and depending classes.
        Parameters
        ------
            mastodonInstance: any
                The initialized Mastodon instance.
        """
        self.mastodonInstance = mastodonInstance
        self.compilePattern = re.compile('<.*?>')
        self.sentiTooter = SentiTooter()
        self.localTimezone = pytz.timezone('Europe/Berlin')
-    def getLocalTimeline(self, minId=None):
+    def getLocalTimeline(self, minId=None) -> any:
        """Receive the local timeline.
        Parameters
        ------
            minId: str | None
                The last fetched toot id from the database.
        Returns
        ------
            any
                The local Mastodon timeline from fedihum.org.
        """
        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
-    def cleanhtml(self, raw_html):
+    def cleanhtml(self, raw_html:str) -> str:
        """Remove HTML tags and http(s) URLs from a toot.
        Parameters
        ------
            raw_html: str
            The toot content.
        Returns
        ------
            str:
            The cleaned toot content.
        """
        cleantext = re.sub(self.compilePattern, '', raw_html)
        cleantext = re.sub(r'http\S+', '', cleantext)
        return cleantext
-    def buildTootsDataframe(self, minId=None):
+    def buildTootsDataframe(self, minId=None) -> DataFrame:
        """Parse fetched toots from Mastodon to dataframe.
        Parameters
        ------
            minId: str | None
            The id of the last fetched toot.
        Returns
        ------
            DataFrame
            A Dataframe containing
            sentiment: str
                The sentiment (positive, neutral, negative)
            model: str
                The used sentiment model.
            toot: str
                The content of the toot.
            datetime: datetime
                The datetime of the toot.
            language: str
                The language flag of the toot.
            userName: str
                The user name of the toot.
            userId: str
                The user id.
            tootId: str
                The toot id.
        """
        toots = []
        allTimelineResults = []
        timelinePagination = self.getLocalTimeline(minId)