add code documentation

This commit is contained in:
rnsrk 2023-03-17 20:06:01 +01:00
parent 4479bd2429
commit bc842244c7
7 changed files with 261 additions and 31 deletions

View file

@ -3,7 +3,19 @@ import pandas as pd
from sqlalchemy import desc, select, sql from sqlalchemy import desc, select, sql
from Tables import Toots from Tables import Toots
from pandas.core.api import (
DataFrame)
def calculateSentimentCount(): def calculateSentimentCount():
"""Calculates the frequencies of the sentiments.
Returns
-------
DataFrame
Containing date (YYYY-MM-DD), sentiment (positive, neutral, negative),
and sentimentCount.
"""
query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
FROM Toots FROM Toots
GROUP BY DATE(datetime), GROUP BY DATE(datetime),
@ -16,7 +28,18 @@ def calculateSentimentCount():
parse_dates=["datetime"] parse_dates=["datetime"]
) )
def calculateSentimentMean(dataframe): def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
"""Calculates the mean of the sentiments.
Parameters
-------
dataframe: DataFrame
Sentiment counts; must contain the columns sentiment and sentimentCount.
Returns
-------
DataFrame
Containing date (YY-MM-DD), sentimentsMean.
"""
negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
sentimentSum = dataframe['sentimentCount'].sum() sentimentSum = dataframe['sentimentCount'].sum()
@ -31,7 +54,14 @@ def calculateSentimentMean(dataframe):
] ]
) )
def getYesterdaysToots(): def getYesterdaysToots() -> DataFrame:
"""Fetches yesterdays toots from database.
Returns
-------
pd.DataFrame
Containing date (YY-MM-DD), language, sentiment, toot.
"""
query = f'''SELECT datetime as date, language, sentiment, toot query = f'''SELECT datetime as date, language, sentiment, toot
FROM Toots FROM Toots
WHERE datetime >= DATE("now","-1 day") WHERE datetime >= DATE("now","-1 day")
@ -43,16 +73,49 @@ def getYesterdaysToots():
) )
class CRUDManager(): class CRUDManager():
"""Class for database operations"""
def saveToDatabase(self, dataframe, table:str, useIndex=False): def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
"""Saves dataframe to database.
Parameters
-------
dataframe: DataFrame
Input dataframe.
table: str
Name of the table in which to save the data.
useIndex: bool
Whether to write the index of the dataframe as a column of
the database table.
"""
try: try:
dataframe.to_sql(table, engine, index=useIndex, if_exists="append") dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
except: except:
print(f'Could not save data to {table}!') print(f'Could not save data to {table}!')
def loadFromDatabase(self, table:str, indexColumn=None): def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
"""Load a table into a dataframe.
Parameters
-------
table: str
Name of the table to load the data from.
indexColumn: str | None
Name of the column to use as index of the dataframe, or None
to use no table column as index.
Returns
-------
DataFrame
"""
return pd.read_sql_table(table, connection, index_col=indexColumn) return pd.read_sql_table(table, connection, index_col=indexColumn)
def getLastToot(self): def getLastToot(self) -> str:
"""Query the last toot id from database.
Returns
-------
str
A toot id.
"""
stmt = select(Toots.tootId).order_by(desc('datetime')) stmt = select(Toots.tootId).order_by(desc('datetime'))
return session.scalars(stmt).first() return session.scalars(stmt).first()

View file

@ -1,3 +1,7 @@
"""Script to initialize the database.
Serves database url, engine, connection and session.
"""
from sqlalchemy import create_engine from sqlalchemy import create_engine
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.declarative import declarative_base
@ -9,4 +13,6 @@ session = Session(engine)
Base = declarative_base() Base = declarative_base()
def init_db(): def init_db():
"""Initialize the database.
"""
Base.metadata.create_all(bind=engine) Base.metadata.create_all(bind=engine)

27
Main.py
View file

@ -1,3 +1,19 @@
"""
Hedonodon toot sentiment analyzer.
This program fetches toots from the fedihum.org Mastodon instance, calculates
the frequencies of the sentiments (positive, neutral, negative) and the mean
of these nominal values (even though this is not statistically correct (;-_-)!,
not all analyzers return compound scores).
It also calculates the word count of the nouns per sentiment.
It uses germansentiment for German toots, twitter-roberta-base-sentiment for
English toots, and vaderSentiment for other languages.
For the word counts I first translate the toots to English with the
GoogleTranslator.
"""
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
from datetime import datetime, date from datetime import datetime, date
from DbSetup import init_db from DbSetup import init_db
@ -6,7 +22,7 @@ from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.dates as mdates import matplotlib.dates as mdates
from TootCrawler import TootCrawler from TootCrawler import TootCrawler
from SentiTooter import translateToots, createWordCountPerSentiment from SentiTooter import translateToots, createWordFrequenciesPerSentiment
locale.setlocale(locale.LC_TIME, "en_US.UTF-8") locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
init_db() init_db()
@ -40,7 +56,7 @@ else:
print('Calculate word counts...') print('Calculate word counts...')
yesterdaysToots = getYesterdaysToots() yesterdaysToots = getYesterdaysToots()
translatedToots = translateToots(yesterdaysToots) translatedToots = translateToots(yesterdaysToots)
wordCountsPerSentiment = createWordCountPerSentiment(translatedToots) wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
print('done!') print('done!')
print(wordCountsPerSentiment); print(wordCountsPerSentiment);
@ -116,6 +132,7 @@ plotFileUrl = f'./plots/{TodayDate}.png'
plt.savefig(plotFileUrl) plt.savefig(plotFileUrl)
print('done!') print('done!')
media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") print('Send toot...')
mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') #media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
print('done!')

View file

@ -1,5 +1,7 @@
from mastodon import Mastodon from mastodon import Mastodon
class MastodonAccountManager(): class MastodonAccountManager():
"""Initialize the Mastodon account.
"""
def __init__(self): def __init__(self):
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')

View file

@ -1,4 +1,5 @@
from germansentiment import SentimentModel from germansentiment import SentimentModel
from pandas import DataFrame
import numpy as np import numpy as np
from scipy.special import softmax from scipy.special import softmax
from transformers import AutoModelForSequenceClassification from transformers import AutoModelForSequenceClassification
@ -9,7 +10,18 @@ import spacy
from collections import Counter from collections import Counter
# Preprocess text (username and link placeholders) # Preprocess text (username and link placeholders)
def preprocess(text): def preprocess(text:str) -> str:
"""Removes tags and urls from text.
Parameters
------
text: str
The raw toot from Mastodon.
Returns
------
str
The cleaned text.
"""
new_text = [] new_text = []
for t in text.split(" "): for t in text.split(" "):
@ -20,9 +32,12 @@ def preprocess(text):
class SentiTooter: class SentiTooter:
"""""" """Class to analyze the toots.
"""
def __init__(self): def __init__(self):
"""Initialize the sentiment models and labels.
"""
self.deModel = SentimentModel() self.deModel = SentimentModel()
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
self.enModel, self.enTokenizer = self.initModel() self.enModel, self.enTokenizer = self.initModel()
@ -30,7 +45,20 @@ class SentiTooter:
self.labels = ['negative', 'neutral', 'positive'] self.labels = ['negative', 'neutral', 'positive']
self.sia = SentimentIntensityAnalyzer() self.sia = SentimentIntensityAnalyzer()
def analyze(self, language, content): def analyze(self, language:str, content:str) -> list[str, str, float]:
"""Analyzes the sentiments of the toots.
Parameters
------
language: str
The language tag of the toot.
content: str
The toot content.
Returns
------
list[str, str, float]
A list with the sentiment, analyzer type, and sentiment score.
"""
match language: match language:
case 'de': case 'de':
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True) sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -61,6 +89,13 @@ class SentiTooter:
def initModel(self): def initModel(self):
"""Initialize the english models.
Returns
------
tuple
The pretrained model and tokenizer.
"""
# PT # PT
tokenizer = AutoTokenizer.from_pretrained(self.enModelType) tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
tokenizer.save_pretrained(self.enModelType) tokenizer.save_pretrained(self.enModelType)
@ -68,7 +103,14 @@ class SentiTooter:
model.save_pretrained(self.enModelType) model.save_pretrained(self.enModelType)
return model, tokenizer return model, tokenizer
def translateToots(yesterdaysToots): def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
"""Translates all toots to english.
Returns
------
DataFrame
Containing the toots translated into English.
"""
yesterdaysTootsTranslated = yesterdaysToots yesterdaysTootsTranslated = yesterdaysToots
for index, row in yesterdaysTootsTranslated.iterrows(): for index, row in yesterdaysTootsTranslated.iterrows():
if (row['language'] != 'en'): if (row['language'] != 'en'):
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
yesterdaysTootsTranslated.drop(index) yesterdaysTootsTranslated.drop(index)
return yesterdaysTootsTranslated return yesterdaysTootsTranslated
def translateToot(language, toot): def translateToot(language:str, toot:str) -> str:
"""Translate a toot in english.
Parameters
------
language:str
The language of the toot.
toot: str
The toot content.
Returns
------
str
The toot translated into English.
"""
content = preprocess(toot) content = preprocess(toot)
return GoogleTranslator(source=language, target='en').translate(content) return GoogleTranslator(source=language, target='en').translate(content)
def countWords(concatedToots, count): def countWords(concatedToots: str, number: int) -> list:
"""Counts the word frequencies in all toots of a given sentiment.
Parameters
------
concatedToots: str
All toots from a sentiment.
number: int
Number of most frequent words to return.
Returns
------
list
List of (word, frequency) tuples.
"""
nlp = spacy.load('en_core_web_md') nlp = spacy.load('en_core_web_md')
doc = nlp(concatedToots) doc = nlp(concatedToots)
@ -96,18 +166,30 @@ def countWords(concatedToots, count):
# five most common noun tokens # five most common noun tokens
noun_freq = Counter(nouns) noun_freq = Counter(nouns)
return noun_freq.most_common(count) return noun_freq.most_common(number)
def createWordCountPerSentiment(translatedToots): def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
"""Count all word frequencies of all toots per sentiment.
Parameters
------
translatedToots: DataFrame
The dataframe with all toots in english.
Returns
------
str
Containing words and word counts per sentiment.
"""
sentimentList = [] sentimentList = []
for sentiment in ['positive', 'neutral', 'negative']: for sentiment in ['positive', 'neutral', 'negative']:
tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5) wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
countList = [] FrequenciesList = []
for count in wordCounts: for Frequencies in wordFrequencies:
countList.append(str(count[0]) + ' (' + str(count[1]) + ')') FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
list2String = ', '.join(countList) list2String = ', '.join(FrequenciesList)
sentimentString = sentiment + ': ' + list2String sentimentString = sentiment + ': ' + list2String
sentimentList.append(sentimentString) sentimentList.append(sentimentString)
wordCountsPerSentiments = '\n'.join(sentimentList) wordFrequenciessPerSentiments = '\n'.join(sentimentList)
return wordCountsPerSentiments return wordFrequenciessPerSentiments

View file

@ -1,3 +1,5 @@
"""This script contains the table definitions for the database."""
from DbSetup import Base from DbSetup import Base
from sqlalchemy import Column, Date, Integer, Float, String from sqlalchemy import Column, Date, Integer, Float, String
@ -14,8 +16,6 @@ class Toots(Base):
userName = Column(String(255)) userName = Column(String(255))
userId = Column(String(255)) userId = Column(String(255))
class SentimentCounts(Base): class SentimentCounts(Base):
__tablename__ = 'SentimentCounts' __tablename__ = 'SentimentCounts'
__table_args__ = {'extend_existing': True} __table_args__ = {'extend_existing': True}

View file

@ -1,27 +1,87 @@
from langdetect import detect from langdetect import detect
import pytz import pytz
import pandas as pd import pandas as pd
from pandas import DataFrame
import re import re
from SentiTooter import SentiTooter from SentiTooter import SentiTooter
from pprint import pprint from pprint import pprint
class TootCrawler(): class TootCrawler():
"""Class to fetch the recent toots from fedihum.org."""
def __init__(self, mastodonInstance) -> None: def __init__(self, mastodonInstance: any) -> None:
"""Initialize the Mastodon instance and depending classes.
Parameters
------
mastodonInstance: any
The initialized Mastodon instance.
"""
self.mastodonInstance = mastodonInstance self.mastodonInstance = mastodonInstance
self.compilePattern = re.compile('<.*?>') self.compilePattern = re.compile('<.*?>')
self.sentiTooter = SentiTooter() self.sentiTooter = SentiTooter()
self.localTimezone = pytz.timezone('Europe/Berlin') self.localTimezone = pytz.timezone('Europe/Berlin')
def getLocalTimeline(self, minId=None): def getLocalTimeline(self, minId=None) -> any:
"""Receive the local timeline.
Parameters
------
minId: str | None
The last fetched toot id from the database.
Returns
------
any
The local Mastodon timeline from fedihum.org.
"""
return self.mastodonInstance.timeline_local(min_id=minId, limit=500) return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
def cleanhtml(self, raw_html): def cleanhtml(self, raw_html:str) -> str:
"""Remove HTML tags and http(s) links from toots.
Parameters
------
raw_html: str
The toot content.
Returns
------
str
The cleaned toot content.
"""
cleantext = re.sub(self.compilePattern, '', raw_html) cleantext = re.sub(self.compilePattern, '', raw_html)
cleantext = re.sub(r'http\S+', '', cleantext) cleantext = re.sub(r'http\S+', '', cleantext)
return cleantext return cleantext
def buildTootsDataframe(self, minId=None): def buildTootsDataframe(self, minId=None) -> DataFrame:
"""Parse fetched toots from Mastodon to dataframe.
Parameters
------
minId: str | None
The id of the last fetched toot.
Returns
------
DataFrame
A DataFrame containing
sentiment: str
The sentiment (positive, neutral, negative)
model: str
The used sentiment model.
toot: str
The content of the toot.
datetime: datetime
The datetime of the toot.
language: str
The language flag of the toot.
userName: str
The user name of the toot.
userId: str
The user id.
tootId: str
The toot id.
"""
toots = [] toots = []
allTimelineResults = [] allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId) timelinePagination = self.getLocalTimeline(minId)