diff --git a/.gitignore b/.gitignore index d946327..613ca0d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,12 @@ -database.db -plots -instance -__pycache__ -hedonodon_clientcred.secret -hedonodon_usercred.secret -.fleet -test.py -.idea -cardiffnlp -venv \ No newline at end of file +database.db +plots +instance +__pycache__ +hedonodon_clientcred.secret +hedonodon_usercred.secret +.fleet +test.py +.idea +cardiffnlp +venv +logs.txt diff --git a/CRUDManager.py b/CRUDManager.py index 0226fbf..cd0c7e0 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -1,10 +1,21 @@ -from DbSetup import engine, session, databaseUrl +from DbSetup import connection, engine, session, databaseUrl import pandas as pd -from sqlalchemy import desc, select +from sqlalchemy import desc, select, sql from Tables import Toots +from pandas.core.api import ( + DataFrame) def calculateSentimentCount(): + """Calculates the frequencies of the sentiments. + + Returns + ------- + DataFrame + Containing date (YY-MM-DD), sentiment (positive, neutral, negative), + and sentimentCount. + """ + query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount FROM Toots GROUP BY DATE(datetime), @@ -12,12 +23,23 @@ def calculateSentimentCount(): HAVING datetime >= DATE("now","-1 day") AND datetime < DATE("now")''' return pd.read_sql( - query, - databaseUrl, + sql.text(query), + connection, parse_dates=["datetime"] ) -def calculateSentimentMean(dataframe): +def calculateSentimentMean(dataframe:DataFrame) -> DataFrame: + """Calculates the mean of the sentiments. + + Parameters + ------- + dataframe: DataFrame + + Returns + ------- + Dataframe + Containing date (YY-MM-DD), sentimentsMean. 
+ """ negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() sentimentSum = dataframe['sentimentCount'].sum() @@ -32,17 +54,68 @@ def calculateSentimentMean(dataframe): ] ) -class CRUDManager(): +def getYesterdaysToots() -> DataFrame: + """Fetches yesterdays toots from database. - def saveToDatabase(self, dataframe, table:str, useIndex=False): + Returns + ------- + pd.Dataframe + Containing date (YY-MM-DD), language, sentiment, toot. + """ + query = f'''SELECT datetime as date, language, sentiment, toot + FROM Toots + WHERE datetime >= DATE("now","-1 day") + AND datetime < DATE("now")''' + return pd.read_sql( + sql.text(query), + connection, + parse_dates=["datetime"] + ) + +class CRUDManager(): + """Class for database operations""" + + def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False): + """Saves dataframe to database. + + Parameters + ------- + dataframe: DataFrame + Input dataframe. + table: str + Table, where to save the data. + useIndex: boolean + Should the index of the dataframe be used as index for + the database table? + """ try: dataframe.to_sql(table, engine, index=useIndex, if_exists="append") except: print(f'Could not save data to {table}!') - def loadFromDatabase(self, table:str, indexColumn=None): - return pd.read_sql_table(table, databaseUrl, index_col=indexColumn) + def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame: + """Load a table into a dataframe. - def getLastToot(self): + Parameters + ------- + table: str + Table, where to save the data. + indexColumn: str | None + Should the index of the table be used as index for + the dataframe? + Returns + ------- + DataFrame + """ + return pd.read_sql_table(table, connection, index_col=indexColumn) + + def getLastToot(self) -> str: + """Query the last toot id from database. + + Results + ------- + str + A toot id. 
+ """ stmt = select(Toots.tootId).order_by(desc('datetime')) - return session.scalars(stmt).first() \ No newline at end of file + return session.scalars(stmt).first() diff --git a/DbSetup.py b/DbSetup.py index 1898556..b6179f7 100644 --- a/DbSetup.py +++ b/DbSetup.py @@ -1,11 +1,18 @@ -from sqlalchemy import create_engine -from sqlalchemy.orm import Session -from sqlalchemy.ext.declarative import declarative_base - -databaseUrl = 'sqlite:///database.db' -engine = create_engine(databaseUrl, future=True) -session = Session(engine) -Base = declarative_base() - -def init_db(): - Base.metadata.create_all(bind=engine) +"""Script to initialize the database. + Serves database url, engine, connection and session. +""" + +from sqlalchemy import create_engine +from sqlalchemy.orm import Session +from sqlalchemy.ext.declarative import declarative_base + +databaseUrl = 'sqlite:///database.db' +engine = create_engine(databaseUrl, future=True) +connection = engine.connect() +session = Session(engine) +Base = declarative_base() + +def init_db(): + """Initialize the database. + """ + Base.metadata.create_all(bind=engine) diff --git a/Main.py b/Main.py index a64a63c..5bb6532 100644 --- a/Main.py +++ b/Main.py @@ -1,4 +1,20 @@ -from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean +""" +Hedonodon toot sentiment analyzer. + +This programm fetches toots from the fedihum.org Mastodon instance, calculates +the frequencies of the sentiments (positive, neutral, negative) and the mean +from these nominal values (even this is not statistical correct (;-_-)!, but +not all analyzer return compounds). +It also calculates the word count of the nouns per sentiment. + +It uses germansentiment for german toots, twitter-roberta-base-sentiment for +english toots, and vaderSentiment for other languages. + +For the word counts I translate the toots to english with the GoogleTranslator +first. 
+""" + +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots from datetime import datetime, date from DbSetup import init_db import locale @@ -6,10 +22,12 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler +from SentiTooter import translateToots, createWordFrequenciesPerSentiment -locale.setlocale(locale.LC_TIME, "en_EN.UTF-8") +locale.setlocale(locale.LC_TIME, "en_US.UTF-8") init_db() +print('Initialize Mastodon...') mastodonAccountManager = MastodonAccountManager() mastodonInstance = mastodonAccountManager.instance """ @@ -19,27 +37,47 @@ mastodonInstance.log_in( to_file = 'hedonodon_usercred.secret' ) """ +print('done!') +print('Fetching recent toots...') tootCrawler = TootCrawler(mastodonInstance) crudManager = CRUDManager() - lastTootId = crudManager.getLastToot() tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) -exit() +print('done!') + +print('Save toots to database...') if not tootsDataframe.empty: crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) + print('done!') else: - print('Nothing changed since last database insert!') + print('nothing changed since last database insert!') +print('Calculate word counts...') +yesterdaysToots = getYesterdaysToots() +translatedToots = translateToots(yesterdaysToots) +wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots) +print('done!') + +print(wordCountsPerSentiment); + +print('Calculate sentiment counts...') sentimentsYesterday = calculateSentimentCount() -sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) +print('done!') +print('Calculate sentiment mean...') +sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) +print('done!') + +print('Save calculations to database...') if not tootsDataframe.empty: crudManager.saveToDatabase(dataframe=sentimentsYesterday, 
table='SentimentCounts', useIndex=True) crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) + print('done!') else: - print('Nothing changed since last database insert!') + print('nothing changed since last database insert!') +print('Create figure...') colormap = { 'negative': '#ff9999', 'neutral': '#ffcc99', @@ -80,7 +118,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) # Line chart. lineChart = dataframe4LineChart.plot.line( ax=axes[1], - title='Mean of all sentiments from max positive (1) to min negative (-1)' + title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!' ) axes[1].grid(True) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) @@ -92,7 +130,9 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) axes[1].tick_params(which='minor', length=0) plotFileUrl = f'./plots/{TodayDate}.png' plt.savefig(plotFileUrl) +print('done!') +print('Send toot...') #media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") -#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') - +#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') +print('done!') diff --git a/MastodonAccountManager.py b/MastodonAccountManager.py index 9c51e54..957ac4e 100644 --- a/MastodonAccountManager.py +++ b/MastodonAccountManager.py @@ -1,5 +1,7 @@ -from mastodon import Mastodon - -class MastodonAccountManager(): - def __init__(self): - self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') +from mastodon import Mastodon + +class MastodonAccountManager(): + """Initialize the Mastodon 
account. + """ + def __init__(self): + self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') diff --git a/README.md b/README.md index fdbe2f7..58d02b6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,19 @@ -# Hedonodon -I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds. - -More Documentation coming soon! \ No newline at end of file +# Hedonodon +## Prerequisites +Install the dependencies with `python -m pip install -r requirements.txt`. +Install spaCy's NLP model with `python -m spacy download en_core_web_lg`. +If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fails, go to the model page on Hugging Face (see the Models section) and download them to the respective folder (cardiffnlp/twitter-roberta-base-sentiment). + +## Purpose +Hedonodon fetches toots from fedihum.org, calculates the sentiments, the sentiment mean and the word frequencies of each day, and creates fancy diagrams from the data. + +## Motivation +This tool was created to understand how sentiment analysis and NLP methods work, so it may lack proper use of models etc. + +## Models +It uses ["germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for German toots, ["twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for +English toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages. +For the word counts I translate the toots to English with the GoogleTranslator from [deep_translator](https://pypi.org/project/deep-translator/) first and then use spaCy's NLP model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies. + +## Weaknesses +Since some modules do not return sentiment compounds, I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statistically not okay (;-_-). 
\ No newline at end of file diff --git a/SentiTooter.py b/SentiTooter.py index 9626078..aabc498 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -1,26 +1,43 @@ from germansentiment import SentimentModel +from pandas import DataFrame import numpy as np from scipy.special import softmax from transformers import AutoModelForSequenceClassification from transformers import AutoTokenizer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer - +from deep_translator import GoogleTranslator +import spacy +from collections import Counter # Preprocess text (username and link placeholders) -def preprocess(text): +def preprocess(text:str) -> str: + """Removes tags and urls from text. + + Parameters + ------ + text: str + The raw toot from Mastodon. + Returns + ------ + str + The cleaned text. + """ new_text = [] for t in text.split(" "): - t = '@user' if t.startswith('@') and len(t) > 1 else t - t = 'http' if t.startswith('http') else t + t = '' if t.startswith('@') and len(t) > 1 else t + t = '' if t.startswith('http') else t new_text.append(t) return " ".join(new_text) class SentiTooter: - """""" + """Class to analyze the toots. + """ def __init__(self): + """Initilize the sentiment models and labels. + """ self.deModel = SentimentModel() self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" self.enModel, self.enTokenizer = self.initModel() @@ -28,7 +45,20 @@ class SentiTooter: self.labels = ['negative', 'neutral', 'positive'] self.sia = SentimentIntensityAnalyzer() - def analyze(self, language, content): + def analyze(self, language:str, content:str) -> list[str, str, float]: + """Analyzes the sentiments of the toots. + + Parameters + ------ + language: str + The language tag of the toot. + content: str + The toot content. + Returns + ------ + list[str, str, float] + A list with the sentiment, analyzer type, and sentiment score. 
+ """ match language: case 'de': sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True) @@ -41,15 +71,15 @@ class SentiTooter: output = self.enModel(**encoded_input) scores = output[0][0].detach().numpy() scores = softmax(scores) - print(scores) + #print(scores) sentimentIndexWithMaxScore = np.argmax(scores) sentimentLabel = self.labels[sentimentIndexWithMaxScore] sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)] - print(sentiment) + #print(sentiment) return sentiment case _: compound = self.sia.polarity_scores(content)['compound'] - print(self.sia.polarity_scores(content), 'vaderSentiment') + #print(self.sia.polarity_scores(content), 'vaderSentiment') if compound > (1 / 3): return ['positive', 'vaderSentiment'] elif compound < (-1 / 3): @@ -58,8 +88,14 @@ class SentiTooter: return ['neutral', 'vaderSentiment'] - def initModel(self): + """Initialize the english models. + + Returns + ------ + tupel + The pretrained model and tokenizer. + """ # PT tokenizer = AutoTokenizer.from_pretrained(self.enModelType) tokenizer.save_pretrained(self.enModelType) @@ -67,12 +103,93 @@ class SentiTooter: model.save_pretrained(self.enModelType) return model, tokenizer - # # TF - # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL) - # model.save_pretrained(MODEL) +def translateToots(yesterdaysToots:DataFrame) -> DataFrame: + """Translates all toots to english. - # text = "Good night 😊" - # encoded_input = tokenizer(text, return_tensors='tf') - # output = model(encoded_input) - # scores = output[0][0].numpy() - # scores = softmax(scores) + Returns + ------ + Dataframe + Containing the english translated toots. 
+ """ + yesterdaysTootsTranslated = yesterdaysToots + for index, row in yesterdaysTootsTranslated.iterrows(): + if (row['language'] != 'en'): + try: + yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot']) + yesterdaysTootsTranslated.at[index,'language'] = 'en' + except: + yesterdaysTootsTranslated.drop(index) + return yesterdaysTootsTranslated + +def translateToot(language:str, toot:str) -> str: + """Translate a toot in english. + + Parameters + ------ + language:str + The language of the toot. + toot: str + The toot content. + + Returns + ------ + str + The in english translated toot. + """ + content = preprocess(toot) + return GoogleTranslator(source=language, target='en').translate(content) + +def countWords(concatedToots: str, number: int) -> list: + """Counts the word frequencies in all toots of a given sentiment. + + Parameters + ------ + concatedToots: str + All toots from a sentiment. + number: int + Number of words to calculate word frequencies. + + Returns + ------ + list + List containing tuple of word and word frequency. + """ + nlp = spacy.load('en_core_web_lg') + doc = nlp(concatedToots) + + # noun tokens that arent stop words or punctuations + nouns = [token.text + for token in doc + if (not token.is_stop and + not token.is_punct and + token.pos_ == "NOUN")] + + # five most common noun tokens + noun_freq = Counter(nouns) + return noun_freq.most_common(number) + +def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str: + """Count all word frequencies of all toots per sentiment. + + Paramters + ------ + translatedToots: DataFrame + The dataframe with all toots in english. + + Returns + ------ + str + Containing words and wourd counts per sentiment. 
+ """ + sentimentList = [] + for sentiment in ['positive', 'neutral', 'negative']: + tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot + wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5) + FrequenciesList = [] + for Frequencies in wordFrequencies: + FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')') + list2String = ', '.join(FrequenciesList) + sentimentString = sentiment + ': ' + list2String + sentimentList.append(sentimentString) + wordFrequenciessPerSentiments = '\n'.join(sentimentList) + return wordFrequenciessPerSentiments \ No newline at end of file diff --git a/Tables.py b/Tables.py index 78aa412..071179a 100644 --- a/Tables.py +++ b/Tables.py @@ -1,32 +1,32 @@ -from DbSetup import Base -from sqlalchemy import Column, Date, Integer, Float, String - -class Toots(Base): - __tablename__ = 'Toots' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - model = Column(String(30)) - datetime = Column(Date) - language = Column(String(3)) - sentiment = Column(String(8)) - tootId = Column(String(255)) - toot = Column(String(600)) - userName = Column(String(255)) - userId = Column(String(255)) - - - -class SentimentCounts(Base): - __tablename__ = 'SentimentCounts' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - sentimentCount = Column(Integer) - date = Column(Date, primary_key=True) - sentiment = Column(String(8)) - -class SentimentMeans(Base): - __tablename__ = 'SentimentMeans' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - date = Column(Date, primary_key=True) +"""This script containing the table definitions for the database.""" + +from DbSetup import Base +from sqlalchemy import Column, Date, Integer, Float, String + +class Toots(Base): + __tablename__ = 'Toots' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + model = Column(String(30)) + 
datetime = Column(Date) + language = Column(String(3)) + sentiment = Column(String(8)) + tootId = Column(String(255)) + toot = Column(String(600)) + userName = Column(String(255)) + userId = Column(String(255)) + +class SentimentCounts(Base): + __tablename__ = 'SentimentCounts' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + sentimentCount = Column(Integer) + date = Column(Date, primary_key=True) + sentiment = Column(String(8)) + +class SentimentMeans(Base): + __tablename__ = 'SentimentMeans' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + date = Column(Date, primary_key=True) SentimentsMean = Column(Float) \ No newline at end of file diff --git a/TootCrawler.py b/TootCrawler.py index 1b081c2..67cd722 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -1,48 +1,111 @@ -from langdetect import detect -import pytz -import pandas as pd -import re -from SentiTooter import SentiTooter -from pprint import pprint - -class TootCrawler(): - - def __init__(self, mastodonInstance) -> None: - self.mastodonInstance = mastodonInstance - self.compilePattern = re.compile('<.*?>') - self.sentiTooter = SentiTooter() - self.localTimezone = pytz.timezone('Europe/Berlin') - - def getLocalTimeline(self, minId=None): - return self.mastodonInstance.timeline_local(min_id=minId, limit=500) - - def cleanhtml(self, raw_html): - cleantext = re.sub(self.compilePattern, '', raw_html) - cleantext = re.sub(r'http\S+', '', cleantext) - return cleantext - - def buildTootsDataframe(self, minId=None): - toots = [] - allTimelineResults = [] - timelinePagination = self.getLocalTimeline(minId) - - while timelinePagination: - allTimelineResults = allTimelineResults + timelinePagination - timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) - for i in allTimelineResults: - content = self.cleanhtml(i.content) - language = detect(content) - sentiment = self.sentiTooter.analyze(language, content) - 
toot = { - "sentiment": sentiment[0], - "model": sentiment[1], - "toot": content, - "datetime": i.created_at.astimezone(self.localTimezone), - "language": language, - "userName": i.account.display_name, - "userId": i.account.id, - "tootId": i.id - } - toots.append(toot) - toots.sort(key=lambda item:item.get('datetime')) +from langdetect import detect +import pytz +import pandas as pd +from pandas import DataFrame +import re +from SentiTooter import SentiTooter +from pprint import pprint + +class TootCrawler(): + """Class to fetch the recent toots from fedihum.org.""" + + def __init__(self, mastodonInstance: any) -> None: + """Initialize the Mastodon instance and depending classes. + + Parameters + ------ + mastodonInstance: any + The initialized Mastodon instance. + """ + self.mastodonInstance = mastodonInstance + self.compilePattern = re.compile('<.*?>') + self.sentiTooter = SentiTooter() + self.localTimezone = pytz.timezone('Europe/Berlin') + + def getLocalTimeline(self, minId=None) -> any: + """Receave the local timeline + + Parameters + ------ + minId: str | None + The last fetched toot id from the database. + + Returns + ------ + any + The local Mastodon timeline from fedihum.org. + """ + return self.mastodonInstance.timeline_local(min_id=minId, limit=500) + + def cleanhtml(self, raw_html:str) -> str: + """remove brackets and http string from toots + + Parameters + ------ + raw_html: str + The toot content. + Returns + ------ + str: + The cleaned toot content. + """ + cleantext = re.sub(self.compilePattern, '', raw_html) + cleantext = re.sub(r'http\S+', '', cleantext) + return cleantext + + def buildTootsDataframe(self, minId=None) -> DataFrame: + """Parse fetched toots from Mastodon to dataframe. + + Parameters + ------ + minId: str | None + The id of the last fetched toot. + + Returns + ------ + DataFrame + A Dataframe containing + sentiment: str + The sentiment (positive, neutral, negative) + model: str + The used sentiment model. 
+ toot: str + The content of the toot. + datetime: datetime + The datetime of the toot. + language: str + The langage flag of the toot. + userName: str. + The user name of the toot. + userId: str + The user id. + tootId: str + The toot id. + """ + toots = [] + allTimelineResults = [] + timelinePagination = self.getLocalTimeline(minId) + + while timelinePagination: + allTimelineResults = allTimelineResults + timelinePagination + timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) + for i in allTimelineResults: + content = self.cleanhtml(i.content) + try: + language = detect(content) + except: + language = None + sentiment = self.sentiTooter.analyze(language, content) + toot = { + "sentiment": sentiment[0], + "model": sentiment[1], + "toot": content, + "datetime": i.created_at.astimezone(self.localTimezone), + "language": language, + "userName": i.account.display_name, + "userId": i.account.id, + "tootId": i.id + } + toots.append(toot) + toots.sort(key=lambda item:item.get('datetime')) return pd.DataFrame.from_records(toots) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 842b542..bc6906e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,12 @@ matplotlib pandas sqlalchemy vader-multi +langdetect numpy pytz -transformers \ No newline at end of file +transformers +wheel +germansentiment +scipy +deep_translator +spacy \ No newline at end of file