diff --git a/CRUDManager.py b/CRUDManager.py index dccdf00..cd0c7e0 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -3,7 +3,19 @@ import pandas as pd from sqlalchemy import desc, select, sql from Tables import Toots +from pandas.core.api import ( + DataFrame) + def calculateSentimentCount(): + """Calculates the frequencies of the sentiments. + + Returns + ------- + DataFrame + Containing date (YY-MM-DD), sentiment (positive, neutral, negative), + and sentimentCount. + """ + query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount FROM Toots GROUP BY DATE(datetime), @@ -16,7 +28,18 @@ def calculateSentimentCount(): parse_dates=["datetime"] ) -def calculateSentimentMean(dataframe): +def calculateSentimentMean(dataframe:DataFrame) -> DataFrame: + """Calculates the mean of the sentiments. + + Parameters + ------- + dataframe: DataFrame + + Returns + ------- + Dataframe + Containing date (YY-MM-DD), sentimentsMean. + """ negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() sentimentSum = dataframe['sentimentCount'].sum() @@ -31,7 +54,14 @@ def calculateSentimentMean(dataframe): ] ) -def getYesterdaysToots(): +def getYesterdaysToots() -> DataFrame: + """Fetches yesterdays toots from database. + + Returns + ------- + pd.Dataframe + Containing date (YY-MM-DD), language, sentiment, toot. + """ query = f'''SELECT datetime as date, language, sentiment, toot FROM Toots WHERE datetime >= DATE("now","-1 day") @@ -43,16 +73,49 @@ def getYesterdaysToots(): ) class CRUDManager(): + """Class for database operations""" - def saveToDatabase(self, dataframe, table:str, useIndex=False): + def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False): + """Saves dataframe to database. + + Parameters + ------- + dataframe: DataFrame + Input dataframe. + table: str + Table, where to save the data. + useIndex: boolean + Should the index of the dataframe be used as index for + the database table? + """ try: dataframe.to_sql(table, engine, index=useIndex, if_exists="append") except: print(f'Could not save data to {table}!') - def loadFromDatabase(self, table:str, indexColumn=None): + def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame: + """Load a table into a dataframe. + + Parameters + ------- + table: str + Table, where to save the data. + indexColumn: str | None + Should the index of the table be used as index for + the dataframe? + Returns + ------- + DataFrame + """ return pd.read_sql_table(table, connection, index_col=indexColumn) - def getLastToot(self): + def getLastToot(self) -> str: + """Query the last toot id from database. + + Results + ------- + str + A toot id. + """ stmt = select(Toots.tootId).order_by(desc('datetime')) return session.scalars(stmt).first() diff --git a/DbSetup.py b/DbSetup.py index c787928..b6179f7 100644 --- a/DbSetup.py +++ b/DbSetup.py @@ -1,3 +1,7 @@ +"""Script to initialize the database. + Serves database url, engine, connection and session. +""" + from sqlalchemy import create_engine from sqlalchemy.orm import Session from sqlalchemy.ext.declarative import declarative_base @@ -9,4 +13,6 @@ session = Session(engine) Base = declarative_base() def init_db(): + """Initialize the database. + """ Base.metadata.create_all(bind=engine) diff --git a/Main.py b/Main.py index 2e7908f..5bb6532 100644 --- a/Main.py +++ b/Main.py @@ -1,3 +1,19 @@ +""" +Hedonodon toot sentiment analyzer. + +This programm fetches toots from the fedihum.org Mastodon instance, calculates +the frequencies of the sentiments (positive, neutral, negative) and the mean +from these nominal values (even this is not statistical correct (;-_-)!, but +not all analyzer return compounds). +It also calculates the word count of the nouns per sentiment. + +It uses germansentiment for german toots, twitter-roberta-base-sentiment for +english toots, and vaderSentiment for other languages. + +For the word counts I translate the toots to english with the GoogleTranslator +first. +""" + from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots from datetime import datetime, date from DbSetup import init_db @@ -6,7 +22,7 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler -from SentiTooter import translateToots, createWordCountPerSentiment +from SentiTooter import translateToots, createWordFrequenciesPerSentiment locale.setlocale(locale.LC_TIME, "en_US.UTF-8") init_db() @@ -40,7 +56,7 @@ else: print('Calculate word counts...') yesterdaysToots = getYesterdaysToots() translatedToots = translateToots(yesterdaysToots) -wordCountsPerSentiment = createWordCountPerSentiment(translatedToots) +wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots) print('done!') print(wordCountsPerSentiment); @@ -116,6 +132,7 @@ plotFileUrl = f'./plots/{TodayDate}.png' plt.savefig(plotFileUrl) print('done!') -media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") -mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') - +print('Send toot...') +#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") +#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') +print('done!') diff --git a/MastodonAccountManager.py b/MastodonAccountManager.py index 68d62bb..957ac4e 100644 --- a/MastodonAccountManager.py +++ b/MastodonAccountManager.py @@ -1,5 +1,7 @@ from mastodon import Mastodon class MastodonAccountManager(): + """Initialize the Mastodon account. + """ def __init__(self): self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') diff --git a/SentiTooter.py b/SentiTooter.py index 28a5623..b5c1d6a 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -1,4 +1,5 @@ from germansentiment import SentimentModel +from pandas import DataFrame import numpy as np from scipy.special import softmax from transformers import AutoModelForSequenceClassification @@ -9,7 +10,18 @@ import spacy from collections import Counter # Preprocess text (username and link placeholders) -def preprocess(text): +def preprocess(text:str) -> str: + """Removes tags and urls from text. + + Parameters + ------ + text: str + The raw toot from Mastodon. + Returns + ------ + str + The cleaned text. + """ new_text = [] for t in text.split(" "): @@ -20,9 +32,12 @@ def preprocess(text): class SentiTooter: - """""" + """Class to analyze the toots. + """ def __init__(self): + """Initilize the sentiment models and labels. + """ self.deModel = SentimentModel() self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" self.enModel, self.enTokenizer = self.initModel() @@ -30,7 +45,20 @@ class SentiTooter: self.labels = ['negative', 'neutral', 'positive'] self.sia = SentimentIntensityAnalyzer() - def analyze(self, language, content): + def analyze(self, language:str, content:str) -> list[str, str, float]: + """Analyzes the sentiments of the toots. + + Parameters + ------ + language: str + The language tag of the toot. + content: str + The toot content. + Returns + ------ + list[str, str, float] + A list with the sentiment, analyzer type, and sentiment score. + """ match language: case 'de': sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True) @@ -61,6 +89,13 @@ class SentiTooter: def initModel(self): + """Initialize the english models. + + Returns + ------ + tupel + The pretrained model and tokenizer. + """ # PT tokenizer = AutoTokenizer.from_pretrained(self.enModelType) tokenizer.save_pretrained(self.enModelType) @@ -68,7 +103,14 @@ class SentiTooter: model.save_pretrained(self.enModelType) return model, tokenizer -def translateToots(yesterdaysToots): +def translateToots(yesterdaysToots:DataFrame) -> DataFrame: + """Translates all toots to english. + + Returns + ------ + Dataframe + Containing the english translated toots. + """ yesterdaysTootsTranslated = yesterdaysToots for index, row in yesterdaysTootsTranslated.iterrows(): if (row['language'] != 'en'): @@ -79,11 +121,39 @@ def translateToots(yesterdaysToots): yesterdaysTootsTranslated.drop(index) return yesterdaysTootsTranslated -def translateToot(language, toot): +def translateToot(language:str, toot:str) -> str: + """Translate a toot in english. + + Parameters + ------ + language:str + The language of the toot. + toot: str + The toot content. + + Returns + ------ + str + The in english translated toot. + """ content = preprocess(toot) return GoogleTranslator(source=language, target='en').translate(content) -def countWords(concatedToots, count): +def countWords(concatedToots: str, number: int) -> list: + """Counts the word frequencies in all toots of a given sentiment. + + Parameters + ------ + concatedToots: str + All toots from a sentiment. + number: int + Number of words to calculate word frequencies. + + Returns + ------ + list + List containing tuple of word and word frequency. + """ nlp = spacy.load('en_core_web_md') doc = nlp(concatedToots) @@ -96,18 +166,30 @@ def countWords(concatedToots, count): # five most common noun tokens noun_freq = Counter(nouns) - return noun_freq.most_common(count) + return noun_freq.most_common(number) -def createWordCountPerSentiment(translatedToots): +def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str: + """Count all word frequencies of all toots per sentiment. + + Paramters + ------ + translatedToots: DataFrame + The dataframe with all toots in english. + + Returns + ------ + str + Containing words and wourd counts per sentiment. + """ sentimentList = [] for sentiment in ['positive', 'neutral', 'negative']: tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot - wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5) - countList = [] - for count in wordCounts: - countList.append(str(count[0]) + ' (' + str(count[1]) + ')') - list2String = ', '.join(countList) + wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5) + FrequenciesList = [] + for Frequencies in wordFrequencies: + FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')') + list2String = ', '.join(FrequenciesList) sentimentString = sentiment + ': ' + list2String sentimentList.append(sentimentString) - wordCountsPerSentiments = '\n'.join(sentimentList) - return wordCountsPerSentiments \ No newline at end of file + wordFrequenciessPerSentiments = '\n'.join(sentimentList) + return wordFrequenciessPerSentiments \ No newline at end of file diff --git a/Tables.py b/Tables.py index c64178a..071179a 100644 --- a/Tables.py +++ b/Tables.py @@ -1,3 +1,5 @@ +"""This script containing the table definitions for the database.""" + from DbSetup import Base from sqlalchemy import Column, Date, Integer, Float, String @@ -14,8 +16,6 @@ class Toots(Base): userName = Column(String(255)) userId = Column(String(255)) - - class SentimentCounts(Base): __tablename__ = 'SentimentCounts' __table_args__ = {'extend_existing': True} diff --git a/TootCrawler.py b/TootCrawler.py index a657a5d..67cd722 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -1,27 +1,87 @@ from langdetect import detect import pytz import pandas as pd +from pandas import DataFrame import re from SentiTooter import SentiTooter from pprint import pprint class TootCrawler(): + """Class to fetch the recent toots from fedihum.org.""" - def __init__(self, mastodonInstance) -> None: + def __init__(self, mastodonInstance: any) -> None: + """Initialize the Mastodon instance and depending classes. + + Parameters + ------ + mastodonInstance: any + The initialized Mastodon instance. + """ self.mastodonInstance = mastodonInstance self.compilePattern = re.compile('<.*?>') self.sentiTooter = SentiTooter() self.localTimezone = pytz.timezone('Europe/Berlin') - def getLocalTimeline(self, minId=None): + def getLocalTimeline(self, minId=None) -> any: + """Receave the local timeline + + Parameters + ------ + minId: str | None + The last fetched toot id from the database. + + Returns + ------ + any + The local Mastodon timeline from fedihum.org. + """ return self.mastodonInstance.timeline_local(min_id=minId, limit=500) - def cleanhtml(self, raw_html): + def cleanhtml(self, raw_html:str) -> str: + """remove brackets and http string from toots + + Parameters + ------ + raw_html: str + The toot content. + Returns + ------ + str: + The cleaned toot content. + """ cleantext = re.sub(self.compilePattern, '', raw_html) cleantext = re.sub(r'http\S+', '', cleantext) return cleantext - def buildTootsDataframe(self, minId=None): + def buildTootsDataframe(self, minId=None) -> DataFrame: + """Parse fetched toots from Mastodon to dataframe. + + Parameters + ------ + minId: str | None + The id of the last fetched toot. + + Returns + ------ + DataFrame + A Dataframe containing + sentiment: str + The sentiment (positive, neutral, negative) + model: str + The used sentiment model. + toot: str + The content of the toot. + datetime: datetime + The datetime of the toot. + language: str + The langage flag of the toot. + userName: str. + The user name of the toot. + userId: str + The user id. + tootId: str + The toot id. + """ toots = [] allTimelineResults = [] timelinePagination = self.getLocalTimeline(minId)