add code documentation
This commit is contained in:
parent
4479bd2429
commit
bc842244c7
7 changed files with 261 additions and 31 deletions
|
|
@ -3,7 +3,19 @@ import pandas as pd
|
|||
from sqlalchemy import desc, select, sql
|
||||
from Tables import Toots
|
||||
|
||||
from pandas.core.api import (
|
||||
DataFrame)
|
||||
|
||||
def calculateSentimentCount():
|
||||
"""Calculates the frequencies of the sentiments.
|
||||
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
Containing date (YY-MM-DD), sentiment (positive, neutral, negative),
|
||||
and sentimentCount.
|
||||
"""
|
||||
|
||||
query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
|
||||
FROM Toots
|
||||
GROUP BY DATE(datetime),
|
||||
|
|
@ -16,7 +28,18 @@ def calculateSentimentCount():
|
|||
parse_dates=["datetime"]
|
||||
)
|
||||
|
||||
def calculateSentimentMean(dataframe):
|
||||
def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
|
||||
"""Calculates the mean of the sentiments.
|
||||
|
||||
Parameters
|
||||
-------
|
||||
dataframe: DataFrame
|
||||
|
||||
Returns
|
||||
-------
|
||||
Dataframe
|
||||
Containing date (YY-MM-DD), sentimentsMean.
|
||||
"""
|
||||
negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
|
||||
positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
|
||||
sentimentSum = dataframe['sentimentCount'].sum()
|
||||
|
|
@ -31,7 +54,14 @@ def calculateSentimentMean(dataframe):
|
|||
]
|
||||
)
|
||||
|
||||
def getYesterdaysToots():
|
||||
def getYesterdaysToots() -> DataFrame:
|
||||
"""Fetches yesterday's toots from the database.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.Dataframe
|
||||
Containing date (YY-MM-DD), language, sentiment, toot.
|
||||
"""
|
||||
query = f'''SELECT datetime as date, language, sentiment, toot
|
||||
FROM Toots
|
||||
WHERE datetime >= DATE("now","-1 day")
|
||||
|
|
@ -43,16 +73,49 @@ def getYesterdaysToots():
|
|||
)
|
||||
|
||||
class CRUDManager():
|
||||
"""Class for database operations"""
|
||||
|
||||
def saveToDatabase(self, dataframe, table:str, useIndex=False):
|
||||
def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
|
||||
"""Saves dataframe to database.
|
||||
|
||||
Parameters
|
||||
-------
|
||||
dataframe: DataFrame
|
||||
Input dataframe.
|
||||
table: str
|
||||
Table, where to save the data.
|
||||
useIndex: boolean
|
||||
Should the index of the dataframe be used as index for
|
||||
the database table?
|
||||
"""
|
||||
try:
|
||||
dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
|
||||
except:
|
||||
print(f'Could not save data to {table}!')
|
||||
|
||||
def loadFromDatabase(self, table:str, indexColumn=None):
|
||||
def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
|
||||
"""Load a table into a dataframe.
|
||||
|
||||
Parameters
|
||||
-------
|
||||
table: str
|
||||
Table to load the data from.
|
||||
indexColumn: str | None
|
||||
Name of the table column to use as the index of
|
||||
the dataframe (None for the default index).
|
||||
Returns
|
||||
-------
|
||||
DataFrame
|
||||
"""
|
||||
return pd.read_sql_table(table, connection, index_col=indexColumn)
|
||||
|
||||
def getLastToot(self):
|
||||
def getLastToot(self) -> str:
|
||||
"""Query the last toot id from database.
|
||||
|
||||
Returns
|
||||
-------
|
||||
str
|
||||
A toot id.
|
||||
"""
|
||||
stmt = select(Toots.tootId).order_by(desc('datetime'))
|
||||
return session.scalars(stmt).first()
|
||||
|
|
|
|||
|
|
@ -1,3 +1,7 @@
|
|||
"""Script to initialize the database.
|
||||
Serves database url, engine, connection and session.
|
||||
"""
|
||||
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
|
|
@ -9,4 +13,6 @@ session = Session(engine)
|
|||
Base = declarative_base()
|
||||
|
||||
def init_db():
|
||||
"""Initialize the database.
|
||||
"""
|
||||
Base.metadata.create_all(bind=engine)
|
||||
|
|
|
|||
27
Main.py
27
Main.py
|
|
@ -1,3 +1,19 @@
|
|||
"""
|
||||
Hedonodon toot sentiment analyzer.
|
||||
|
||||
This program fetches toots from the fedihum.org Mastodon instance, calculates
|
||||
the frequencies of the sentiments (positive, neutral, negative) and the mean
|
||||
from these nominal values (even though this is not statistically correct (;-_-)!, but
|
||||
not all analyzers return compound scores).
|
||||
It also calculates the word count of the nouns per sentiment.
|
||||
|
||||
It uses germansentiment for german toots, twitter-roberta-base-sentiment for
|
||||
english toots, and vaderSentiment for other languages.
|
||||
|
||||
For the word counts I translate the toots to english with the GoogleTranslator
|
||||
first.
|
||||
"""
|
||||
|
||||
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
|
||||
from datetime import datetime, date
|
||||
from DbSetup import init_db
|
||||
|
|
@ -6,7 +22,7 @@ from MastodonAccountManager import MastodonAccountManager
|
|||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates as mdates
|
||||
from TootCrawler import TootCrawler
|
||||
from SentiTooter import translateToots, createWordCountPerSentiment
|
||||
from SentiTooter import translateToots, createWordFrequenciesPerSentiment
|
||||
|
||||
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
|
||||
init_db()
|
||||
|
|
@ -40,7 +56,7 @@ else:
|
|||
print('Calculate word counts...')
|
||||
yesterdaysToots = getYesterdaysToots()
|
||||
translatedToots = translateToots(yesterdaysToots)
|
||||
wordCountsPerSentiment = createWordCountPerSentiment(translatedToots)
|
||||
wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
|
||||
print('done!')
|
||||
|
||||
print(wordCountsPerSentiment);
|
||||
|
|
@ -116,6 +132,7 @@ plotFileUrl = f'./plots/{TodayDate}.png'
|
|||
plt.savefig(plotFileUrl)
|
||||
print('done!')
|
||||
|
||||
media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
|
||||
mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
|
||||
|
||||
print('Send toot...')
|
||||
#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
|
||||
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
|
||||
print('done!')
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
from mastodon import Mastodon
|
||||
|
||||
class MastodonAccountManager():
|
||||
"""Initialize the Mastodon account.
|
||||
"""
|
||||
def __init__(self):
|
||||
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
|
||||
|
|
|
|||
112
SentiTooter.py
112
SentiTooter.py
|
|
@ -1,4 +1,5 @@
|
|||
from germansentiment import SentimentModel
|
||||
from pandas import DataFrame
|
||||
import numpy as np
|
||||
from scipy.special import softmax
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
|
@ -9,7 +10,18 @@ import spacy
|
|||
from collections import Counter
|
||||
|
||||
# Preprocess text (username and link placeholders)
|
||||
def preprocess(text):
|
||||
def preprocess(text:str) -> str:
|
||||
"""Removes tags and urls from text.
|
||||
|
||||
Parameters
|
||||
------
|
||||
text: str
|
||||
The raw toot from Mastodon.
|
||||
Returns
|
||||
------
|
||||
str
|
||||
The cleaned text.
|
||||
"""
|
||||
new_text = []
|
||||
|
||||
for t in text.split(" "):
|
||||
|
|
@ -20,9 +32,12 @@ def preprocess(text):
|
|||
|
||||
|
||||
class SentiTooter:
|
||||
""""""
|
||||
"""Class to analyze the toots.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the sentiment models and labels.
|
||||
"""
|
||||
self.deModel = SentimentModel()
|
||||
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
|
||||
self.enModel, self.enTokenizer = self.initModel()
|
||||
|
|
@ -30,7 +45,20 @@ class SentiTooter:
|
|||
self.labels = ['negative', 'neutral', 'positive']
|
||||
self.sia = SentimentIntensityAnalyzer()
|
||||
|
||||
def analyze(self, language, content):
|
||||
def analyze(self, language:str, content:str) -> list[str, str, float]:
|
||||
"""Analyzes the sentiments of the toots.
|
||||
|
||||
Parameters
|
||||
------
|
||||
language: str
|
||||
The language tag of the toot.
|
||||
content: str
|
||||
The toot content.
|
||||
Returns
|
||||
------
|
||||
list[str, str, float]
|
||||
A list with the sentiment, analyzer type, and sentiment score.
|
||||
"""
|
||||
match language:
|
||||
case 'de':
|
||||
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
|
||||
|
|
@ -61,6 +89,13 @@ class SentiTooter:
|
|||
|
||||
|
||||
def initModel(self):
|
||||
"""Initialize the english models.
|
||||
|
||||
Returns
|
||||
------
|
||||
tuple
|
||||
The pretrained model and tokenizer.
|
||||
"""
|
||||
# PT
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
|
||||
tokenizer.save_pretrained(self.enModelType)
|
||||
|
|
@ -68,7 +103,14 @@ class SentiTooter:
|
|||
model.save_pretrained(self.enModelType)
|
||||
return model, tokenizer
|
||||
|
||||
def translateToots(yesterdaysToots):
|
||||
def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
|
||||
"""Translates all toots to english.
|
||||
|
||||
Returns
|
||||
------
|
||||
Dataframe
|
||||
Containing the english translated toots.
|
||||
"""
|
||||
yesterdaysTootsTranslated = yesterdaysToots
|
||||
for index, row in yesterdaysTootsTranslated.iterrows():
|
||||
if (row['language'] != 'en'):
|
||||
|
|
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
|
|||
yesterdaysTootsTranslated.drop(index)
|
||||
return yesterdaysTootsTranslated
|
||||
|
||||
def translateToot(language, toot):
|
||||
def translateToot(language:str, toot:str) -> str:
|
||||
"""Translate a toot into English.
|
||||
|
||||
Parameters
|
||||
------
|
||||
language:str
|
||||
The language of the toot.
|
||||
toot: str
|
||||
The toot content.
|
||||
|
||||
Returns
|
||||
------
|
||||
str
|
||||
The toot translated into English.
|
||||
"""
|
||||
content = preprocess(toot)
|
||||
return GoogleTranslator(source=language, target='en').translate(content)
|
||||
|
||||
def countWords(concatedToots, count):
|
||||
def countWords(concatedToots: str, number: int) -> list:
|
||||
"""Counts the word frequencies in all toots of a given sentiment.
|
||||
|
||||
Parameters
|
||||
------
|
||||
concatedToots: str
|
||||
All toots from a sentiment.
|
||||
number: int
|
||||
Number of words to calculate word frequencies.
|
||||
|
||||
Returns
|
||||
------
|
||||
list
|
||||
List containing tuple of word and word frequency.
|
||||
"""
|
||||
nlp = spacy.load('en_core_web_md')
|
||||
doc = nlp(concatedToots)
|
||||
|
||||
|
|
@ -96,18 +166,30 @@ def countWords(concatedToots, count):
|
|||
|
||||
# five most common noun tokens
|
||||
noun_freq = Counter(nouns)
|
||||
return noun_freq.most_common(count)
|
||||
return noun_freq.most_common(number)
|
||||
|
||||
def createWordCountPerSentiment(translatedToots):
|
||||
def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
|
||||
"""Count all word frequencies of all toots per sentiment.
|
||||
|
||||
Parameters
|
||||
------
|
||||
translatedToots: DataFrame
|
||||
The dataframe with all toots in english.
|
||||
|
||||
Returns
|
||||
------
|
||||
str
|
||||
Containing words and word counts per sentiment.
|
||||
"""
|
||||
sentimentList = []
|
||||
for sentiment in ['positive', 'neutral', 'negative']:
|
||||
tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
|
||||
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5)
|
||||
countList = []
|
||||
for count in wordCounts:
|
||||
countList.append(str(count[0]) + ' (' + str(count[1]) + ')')
|
||||
list2String = ', '.join(countList)
|
||||
wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
|
||||
FrequenciesList = []
|
||||
for Frequencies in wordFrequencies:
|
||||
FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
|
||||
list2String = ', '.join(FrequenciesList)
|
||||
sentimentString = sentiment + ': ' + list2String
|
||||
sentimentList.append(sentimentString)
|
||||
wordCountsPerSentiments = '\n'.join(sentimentList)
|
||||
return wordCountsPerSentiments
|
||||
wordFrequenciessPerSentiments = '\n'.join(sentimentList)
|
||||
return wordFrequenciessPerSentiments
|
||||
|
|
@ -1,3 +1,5 @@
|
|||
"""This script contains the table definitions for the database."""
|
||||
|
||||
from DbSetup import Base
|
||||
from sqlalchemy import Column, Date, Integer, Float, String
|
||||
|
||||
|
|
@ -14,8 +16,6 @@ class Toots(Base):
|
|||
userName = Column(String(255))
|
||||
userId = Column(String(255))
|
||||
|
||||
|
||||
|
||||
class SentimentCounts(Base):
|
||||
__tablename__ = 'SentimentCounts'
|
||||
__table_args__ = {'extend_existing': True}
|
||||
|
|
|
|||
|
|
@ -1,27 +1,87 @@
|
|||
from langdetect import detect
|
||||
import pytz
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import re
|
||||
from SentiTooter import SentiTooter
|
||||
from pprint import pprint
|
||||
|
||||
class TootCrawler():
|
||||
"""Class to fetch the recent toots from fedihum.org."""
|
||||
|
||||
def __init__(self, mastodonInstance) -> None:
|
||||
def __init__(self, mastodonInstance: any) -> None:
|
||||
"""Initialize the Mastodon instance and depending classes.
|
||||
|
||||
Parameters
|
||||
------
|
||||
mastodonInstance: any
|
||||
The initialized Mastodon instance.
|
||||
"""
|
||||
self.mastodonInstance = mastodonInstance
|
||||
self.compilePattern = re.compile('<.*?>')
|
||||
self.sentiTooter = SentiTooter()
|
||||
self.localTimezone = pytz.timezone('Europe/Berlin')
|
||||
|
||||
def getLocalTimeline(self, minId=None):
|
||||
def getLocalTimeline(self, minId=None) -> any:
|
||||
"""Receive the local timeline.
|
||||
|
||||
Parameters
|
||||
------
|
||||
minId: str | None
|
||||
The last fetched toot id from the database.
|
||||
|
||||
Returns
|
||||
------
|
||||
any
|
||||
The local Mastodon timeline from fedihum.org.
|
||||
"""
|
||||
return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
|
||||
|
||||
def cleanhtml(self, raw_html):
|
||||
def cleanhtml(self, raw_html:str) -> str:
|
||||
"""Remove HTML tags and http URLs from toots.
|
||||
|
||||
Parameters
|
||||
------
|
||||
raw_html: str
|
||||
The toot content.
|
||||
Returns
|
||||
------
|
||||
str:
|
||||
The cleaned toot content.
|
||||
"""
|
||||
cleantext = re.sub(self.compilePattern, '', raw_html)
|
||||
cleantext = re.sub(r'http\S+', '', cleantext)
|
||||
return cleantext
|
||||
|
||||
def buildTootsDataframe(self, minId=None):
|
||||
def buildTootsDataframe(self, minId=None) -> DataFrame:
|
||||
"""Parse fetched toots from Mastodon to dataframe.
|
||||
|
||||
Parameters
|
||||
------
|
||||
minId: str | None
|
||||
The id of the last fetched toot.
|
||||
|
||||
Returns
|
||||
------
|
||||
DataFrame
|
||||
A Dataframe containing
|
||||
sentiment: str
|
||||
The sentiment (positive, neutral, negative)
|
||||
model: str
|
||||
The used sentiment model.
|
||||
toot: str
|
||||
The content of the toot.
|
||||
datetime: datetime
|
||||
The datetime of the toot.
|
||||
language: str
|
||||
The language flag of the toot.
|
||||
userName: str
|
||||
The user name of the toot.
|
||||
userId: str
|
||||
The user id.
|
||||
tootId: str
|
||||
The toot id.
|
||||
"""
|
||||
toots = []
|
||||
allTimelineResults = []
|
||||
timelinePagination = self.getLocalTimeline(minId)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue