From 2b98565444e6a8e9ead4389941f7ec391566b5e2 Mon Sep 17 00:00:00 2001 From: Robert Nasarek Date: Fri, 27 Jan 2023 21:08:25 +0100 Subject: [PATCH 01/10] made hedonodon server ready --- .gitignore | 23 ++--- CRUDManager.py | 96 +++++++++---------- DbSetup.py | 23 ++--- Main.py | 196 +++++++++++++++++++------------------- MastodonAccountManager.py | 10 +- README.md | 6 +- SentiTooter.py | 148 ++++++++++++++-------------- Tables.py | 62 ++++++------ TootCrawler.py | 94 +++++++++--------- requirements.txt | 20 ++-- 10 files changed, 342 insertions(+), 336 deletions(-) diff --git a/.gitignore b/.gitignore index d946327..613ca0d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,11 +1,12 @@ -database.db -plots -instance -__pycache__ -hedonodon_clientcred.secret -hedonodon_usercred.secret -.fleet -test.py -.idea -cardiffnlp -venv \ No newline at end of file +database.db +plots +instance +__pycache__ +hedonodon_clientcred.secret +hedonodon_usercred.secret +.fleet +test.py +.idea +cardiffnlp +venv +logs.txt diff --git a/CRUDManager.py b/CRUDManager.py index 0226fbf..4f05098 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -1,48 +1,48 @@ -from DbSetup import engine, session, databaseUrl -import pandas as pd -from sqlalchemy import desc, select -from Tables import Toots - - -def calculateSentimentCount(): - query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount - FROM Toots - GROUP BY DATE(datetime), - sentiment - HAVING datetime >= DATE("now","-1 day") - AND datetime < DATE("now")''' - return pd.read_sql( - query, - databaseUrl, - parse_dates=["datetime"] - ) - -def calculateSentimentMean(dataframe): - negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 - positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() - sentimentSum = dataframe['sentimentCount'].sum() - sentimentMean = (negativeSentimentSum + positiveSentimentSum) / sentimentSum - sentimentDate = dataframe.loc[0]['date'] - return pd.DataFrame.from_records( - [ - { - 'date': sentimentDate, - 'sentimentsMean': sentimentMean - } - ] - ) - -class CRUDManager(): - - def saveToDatabase(self, dataframe, table:str, useIndex=False): - try: - dataframe.to_sql(table, engine, index=useIndex, if_exists="append") - except: - print(f'Could not save data to {table}!') - - def loadFromDatabase(self, table:str, indexColumn=None): - return pd.read_sql_table(table, databaseUrl, index_col=indexColumn) - - def getLastToot(self): - stmt = select(Toots.tootId).order_by(desc('datetime')) - return session.scalars(stmt).first() \ No newline at end of file +from DbSetup import connection, engine, session, databaseUrl +import pandas as pd +from sqlalchemy import desc, select, sql +from Tables import Toots + + +def calculateSentimentCount(): + query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount + FROM Toots + GROUP BY DATE(datetime), + sentiment + HAVING datetime >= DATE("now","-1 day") + AND datetime < DATE("now")''' + return pd.read_sql( + sql.text(query), + connection, + parse_dates=["datetime"] + ) + +def calculateSentimentMean(dataframe): + negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 + positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() + sentimentSum = dataframe['sentimentCount'].sum() + sentimentMean = (negativeSentimentSum + positiveSentimentSum) / sentimentSum + sentimentDate = dataframe.loc[0]['date'] + return pd.DataFrame.from_records( + [ + { + 'date': sentimentDate, + 'sentimentsMean': sentimentMean + } + ] + ) + +class CRUDManager(): + + def saveToDatabase(self, dataframe, table:str, useIndex=False): + try: + dataframe.to_sql(table, engine, index=useIndex, if_exists="append") + except: + print(f'Could not save data to {table}!') + + def loadFromDatabase(self, table:str, indexColumn=None): + return pd.read_sql_table(table, connection, index_col=indexColumn) + + def getLastToot(self): + stmt = select(Toots.tootId).order_by(desc('datetime')) + return session.scalars(stmt).first() diff --git a/DbSetup.py b/DbSetup.py index 1898556..c787928 100644 --- a/DbSetup.py +++ b/DbSetup.py @@ -1,11 +1,12 @@ -from sqlalchemy import create_engine -from sqlalchemy.orm import Session -from sqlalchemy.ext.declarative import declarative_base - -databaseUrl = 'sqlite:///database.db' -engine = create_engine(databaseUrl, future=True) -session = Session(engine) -Base = declarative_base() - -def init_db(): - Base.metadata.create_all(bind=engine) +from sqlalchemy import create_engine +from sqlalchemy.orm import Session +from sqlalchemy.ext.declarative import declarative_base + +databaseUrl = 'sqlite:///database.db' +engine = create_engine(databaseUrl, future=True) +connection = engine.connect() +session = Session(engine) +Base = declarative_base() + +def init_db(): + Base.metadata.create_all(bind=engine) diff --git a/Main.py b/Main.py index b80b3a6..2b6768a 100644 --- a/Main.py +++ b/Main.py @@ -1,98 +1,98 @@ -from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean -from datetime import datetime, date -from DbSetup import init_db -import locale -from MastodonAccountManager import MastodonAccountManager -import matplotlib.pyplot as plt -import matplotlib.dates as mdates -from TootCrawler import TootCrawler - -locale.setlocale(locale.LC_TIME, "en_EN.UTF-8") -init_db() - -mastodonAccountManager = MastodonAccountManager() -mastodonInstance = mastodonAccountManager.instance -""" -mastodonInstance.log_in( - 'USER-EMAIL', - 'PW', - to_file = 'hedonodon_usercred.secret' -) -""" - -tootCrawler = TootCrawler(mastodonInstance) -crudManager = CRUDManager() - -lastTootId = crudManager.getLastToot() -tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) - -if not tootsDataframe.empty: - crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) -else: - print('Nothing changed since last database insert!') - -sentimentsYesterday = calculateSentimentCount() -sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) - -if not tootsDataframe.empty: - crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True) - crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) -else: - print('Nothing changed since last database insert!') - -colormap = { - 'negative': '#ff9999', - 'neutral': '#ffcc99', - "positive": '#99ff99' -} - -todaysColors = [] -for sentiment in sentimentsYesterday['sentiment'].to_numpy(): - todaysColors.append(colormap[sentiment]) - - - -TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y') -dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment') -dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1) - -fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10)) - -# Pie chart. -pieChartlabels = dataframe4PieChart.index.to_numpy() -pieChart = dataframe4PieChart.plot.pie( - ax=axes[0], - y='sentimentCount', - ylabel="", - labels=dataframe4PieChart['sentimentCount'], - title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org', - colors=todaysColors, - wedgeprops=dict(linewidth=3, edgecolor='w'), - startangle=90 -) - -axes[0].axis('equal') -centre_circle = plt.Circle((0, 0), 0.6, fc='white') -axes[0].add_patch(centre_circle) -chartBox = axes[0].get_position() -axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) - -# Line chart. -lineChart = dataframe4LineChart.plot.line( - ax=axes[1], - title='Mean of all sentiments from max positive (1) to min negative (-1)' -) -axes[1].grid(True) -axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) -axes[1].set_ylim([-1, 1]) -axes[1].xaxis.set_major_locator(mdates.MonthLocator()) -axes[1].xaxis.set_minor_locator(mdates.MonthLocator(bymonthday=15)) -axes[1].xaxis.set_major_formatter(plt.NullFormatter()) -axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) -axes[1].tick_params(which='minor', length=0) -plotFileUrl = f'./plots/{TodayDate}.png' -plt.savefig(plotFileUrl) - -media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") -mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') - +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean +from datetime import datetime, date +from DbSetup import init_db +import locale +from MastodonAccountManager import MastodonAccountManager +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +from TootCrawler import TootCrawler + +locale.setlocale(locale.LC_TIME, "en_US.UTF-8") +init_db() + +mastodonAccountManager = MastodonAccountManager() +mastodonInstance = mastodonAccountManager.instance +""" +mastodonInstance.log_in( + 'USER-EMAIL', + 'PW', + to_file = 'hedonodon_usercred.secret' +) +""" + +tootCrawler = TootCrawler(mastodonInstance) +crudManager = CRUDManager() + +lastTootId = crudManager.getLastToot() +tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) + +if not tootsDataframe.empty: + crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) +else: + print('Nothing changed since last database insert!') + +sentimentsYesterday = calculateSentimentCount() +sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) + +if not tootsDataframe.empty: + crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True) + crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) +else: + print('Nothing changed since last database insert!') + +colormap = { + 'negative': '#ff9999', + 'neutral': '#ffcc99', + "positive": '#99ff99' +} + +todaysColors = [] +for sentiment in sentimentsYesterday['sentiment'].to_numpy(): + todaysColors.append(colormap[sentiment]) + + + +TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y') +dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment') +dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1) + +fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10)) + +# Pie chart. +pieChartlabels = dataframe4PieChart.index.to_numpy() +pieChart = dataframe4PieChart.plot.pie( + ax=axes[0], + y='sentimentCount', + ylabel="", + labels=dataframe4PieChart['sentimentCount'], + title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org', + colors=todaysColors, + wedgeprops=dict(linewidth=3, edgecolor='w'), + startangle=90 +) + +axes[0].axis('equal') +centre_circle = plt.Circle((0, 0), 0.6, fc='white') +axes[0].add_patch(centre_circle) +chartBox = axes[0].get_position() +axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) + +# Line chart. +lineChart = dataframe4LineChart.plot.line( + ax=axes[1], + title='Mean of all sentiments from max positive (1) to min negative (-1)' +) +axes[1].grid(True) +axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) +axes[1].set_ylim([-1, 1]) +axes[1].xaxis.set_major_locator(mdates.MonthLocator()) +axes[1].xaxis.set_minor_locator(mdates.MonthLocator(bymonthday=15)) +axes[1].xaxis.set_major_formatter(plt.NullFormatter()) +axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) +axes[1].tick_params(which='minor', length=0) +plotFileUrl = f'./plots/{TodayDate}.png' +plt.savefig(plotFileUrl) + +media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") +mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') + diff --git a/MastodonAccountManager.py b/MastodonAccountManager.py index 9c51e54..68d62bb 100644 --- a/MastodonAccountManager.py +++ b/MastodonAccountManager.py @@ -1,5 +1,5 @@ -from mastodon import Mastodon - -class MastodonAccountManager(): - def __init__(self): - self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') +from mastodon import Mastodon + +class MastodonAccountManager(): + def __init__(self): + self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') diff --git a/README.md b/README.md index fdbe2f7..5440dc1 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Hedonodon -I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds. - +# Hedonodon +I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds. + More Documentation coming soon! \ No newline at end of file diff --git a/SentiTooter.py b/SentiTooter.py index 26b7a47..00899c2 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -1,74 +1,74 @@ -from germansentiment import SentimentModel -import numpy as np -from scipy.special import softmax -from transformers import AutoModelForSequenceClassification -from transformers import AutoTokenizer -from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer - - -# Preprocess text (username and link placeholders) -def preprocess(text): - new_text = [] - - for t in text.split(" "): - t = '@user' if t.startswith('@') and len(t) > 1 else t - t = 'http' if t.startswith('http') else t - new_text.append(t) - return " ".join(new_text) - - -class SentiTooter: - """""" - - def __init__(self): - self.deModel = SentimentModel() - self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" - self.enModel, self.enTokenizer = self.initModel() - # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt - self.labels = ['negative', 'neutral', 'positive'] - self.sia = SentimentIntensityAnalyzer() - - def analyze(self, language, content): - match language: - case 'de': - sentiment = self.deModel.predict_sentiment([content]) - sentiment.append('germanSentiment') - return sentiment - case 'en': - text = preprocess(content) - encoded_input = self.enTokenizer(text, return_tensors='pt') - output = self.enModel(**encoded_input) - scores = output[0][0].detach().numpy() - scores = softmax(scores) - sentimentIndexWithMaxScore = np.argmax(scores) - sentimentLabel = self.labels[sentimentIndexWithMaxScore] - sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment'] - return sentiment - case _: - compound = self.sia.polarity_scores(content)['compound'] - if compound > (1 / 3): - return ['positive', 'vaderSentiment'] - elif compound < (-1 / 3): - return ['negative', 'vaderSentiment'] - else: - return ['neutral', 'vaderSentiment'] - - - - def initModel(self): - # PT - tokenizer = AutoTokenizer.from_pretrained(self.enModelType) - tokenizer.save_pretrained(self.enModelType) - model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) - model.save_pretrained(self.enModelType) - return model, tokenizer - - # # TF - # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL) - # model.save_pretrained(MODEL) - - # text = "Good night 😊" - # encoded_input = tokenizer(text, return_tensors='tf') - # output = model(encoded_input) - # scores = output[0][0].numpy() - # scores = softmax(scores) +from germansentiment import SentimentModel +import numpy as np +from scipy.special import softmax +from transformers import AutoModelForSequenceClassification +from transformers import AutoTokenizer +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer + + +# Preprocess text (username and link placeholders) +def preprocess(text): + new_text = [] + + for t in text.split(" "): + t = '@user' if t.startswith('@') and len(t) > 1 else t + t = 'http' if t.startswith('http') else t + new_text.append(t) + return " ".join(new_text) + + +class SentiTooter: + """""" + + def __init__(self): + self.deModel = SentimentModel() + self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" + self.enModel, self.enTokenizer = self.initModel() + # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt + self.labels = ['negative', 'neutral', 'positive'] + self.sia = SentimentIntensityAnalyzer() + + def analyze(self, language, content): + match language: + case 'de': + sentiment = self.deModel.predict_sentiment([content]) + sentiment.append('germanSentiment') + return sentiment + case 'en': + text = preprocess(content) + encoded_input = self.enTokenizer(text, return_tensors='pt') + output = self.enModel(**encoded_input) + scores = output[0][0].detach().numpy() + scores = softmax(scores) + sentimentIndexWithMaxScore = np.argmax(scores) + sentimentLabel = self.labels[sentimentIndexWithMaxScore] + sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment'] + return sentiment + case _: + compound = self.sia.polarity_scores(content)['compound'] + if compound > (1 / 3): + return ['positive', 'vaderSentiment'] + elif compound < (-1 / 3): + return ['negative', 'vaderSentiment'] + else: + return ['neutral', 'vaderSentiment'] + + + + def initModel(self): + # PT + tokenizer = AutoTokenizer.from_pretrained(self.enModelType) + tokenizer.save_pretrained(self.enModelType) + model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) + model.save_pretrained(self.enModelType) + return model, tokenizer + + # # TF + # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL) + # model.save_pretrained(MODEL) + + # text = "Good night 😊" + # encoded_input = tokenizer(text, return_tensors='tf') + # output = model(encoded_input) + # scores = output[0][0].numpy() + # scores = softmax(scores) diff --git a/Tables.py b/Tables.py index 78aa412..c64178a 100644 --- a/Tables.py +++ b/Tables.py @@ -1,32 +1,32 @@ -from DbSetup import Base -from sqlalchemy import Column, Date, Integer, Float, String - -class Toots(Base): - __tablename__ = 'Toots' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - model = Column(String(30)) - datetime = Column(Date) - language = Column(String(3)) - sentiment = Column(String(8)) - tootId = Column(String(255)) - toot = Column(String(600)) - userName = Column(String(255)) - userId = Column(String(255)) - - - -class SentimentCounts(Base): - __tablename__ = 'SentimentCounts' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - sentimentCount = Column(Integer) - date = Column(Date, primary_key=True) - sentiment = Column(String(8)) - -class SentimentMeans(Base): - __tablename__ = 'SentimentMeans' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - date = Column(Date, primary_key=True) +from DbSetup import Base +from sqlalchemy import Column, Date, Integer, Float, String + +class Toots(Base): + __tablename__ = 'Toots' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + model = Column(String(30)) + datetime = Column(Date) + language = Column(String(3)) + sentiment = Column(String(8)) + tootId = Column(String(255)) + toot = Column(String(600)) + userName = Column(String(255)) + userId = Column(String(255)) + + + +class SentimentCounts(Base): + __tablename__ = 'SentimentCounts' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + sentimentCount = Column(Integer) + date = Column(Date, primary_key=True) + sentiment = Column(String(8)) + +class SentimentMeans(Base): + __tablename__ = 'SentimentMeans' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + date = Column(Date, primary_key=True) SentimentsMean = Column(Float) \ No newline at end of file diff --git a/TootCrawler.py b/TootCrawler.py index 1b081c2..fa131bf 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -1,48 +1,48 @@ -from langdetect import detect -import pytz -import pandas as pd -import re -from SentiTooter import SentiTooter -from pprint import pprint - -class TootCrawler(): - - def __init__(self, mastodonInstance) -> None: - self.mastodonInstance = mastodonInstance - self.compilePattern = re.compile('<.*?>') - self.sentiTooter = SentiTooter() - self.localTimezone = pytz.timezone('Europe/Berlin') - - def getLocalTimeline(self, minId=None): - return self.mastodonInstance.timeline_local(min_id=minId, limit=500) - - def cleanhtml(self, raw_html): - cleantext = re.sub(self.compilePattern, '', raw_html) - cleantext = re.sub(r'http\S+', '', cleantext) - return cleantext - - def buildTootsDataframe(self, minId=None): - toots = [] - allTimelineResults = [] - timelinePagination = self.getLocalTimeline(minId) - - while timelinePagination: - allTimelineResults = allTimelineResults + timelinePagination - timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) - for i in allTimelineResults: - content = self.cleanhtml(i.content) - language = detect(content) - sentiment = self.sentiTooter.analyze(language, content) - toot = { - "sentiment": sentiment[0], - "model": sentiment[1], - "toot": content, - "datetime": i.created_at.astimezone(self.localTimezone), - "language": language, - "userName": i.account.display_name, - "userId": i.account.id, - "tootId": i.id - } - toots.append(toot) - toots.sort(key=lambda item:item.get('datetime')) +from langdetect import detect +import pytz +import pandas as pd +import re +from SentiTooter import SentiTooter +from pprint import pprint + +class TootCrawler(): + + def __init__(self, mastodonInstance) -> None: + self.mastodonInstance = mastodonInstance + self.compilePattern = re.compile('<.*?>') + self.sentiTooter = SentiTooter() + self.localTimezone = pytz.timezone('Europe/Berlin') + + def getLocalTimeline(self, minId=None): + return self.mastodonInstance.timeline_local(min_id=minId, limit=500) + + def cleanhtml(self, raw_html): + cleantext = re.sub(self.compilePattern, '', raw_html) + cleantext = re.sub(r'http\S+', '', cleantext) + return cleantext + + def buildTootsDataframe(self, minId=None): + toots = [] + allTimelineResults = [] + timelinePagination = self.getLocalTimeline(minId) + + while timelinePagination: + allTimelineResults = allTimelineResults + timelinePagination + timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) + for i in allTimelineResults: + content = self.cleanhtml(i.content) + language = detect(content) + sentiment = self.sentiTooter.analyze(language, content) + toot = { + "sentiment": sentiment[0], + "model": sentiment[1], + "toot": content, + "datetime": i.created_at.astimezone(self.localTimezone), + "language": language, + "userName": i.account.display_name, + "userId": i.account.id, + "tootId": i.id + } + toots.append(toot) + toots.sort(key=lambda item:item.get('datetime')) return pd.DataFrame.from_records(toots) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 842b542..47f6d81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,12 @@ -mastodon.py -matplotlib -pandas -sqlalchemy -vader-multi -numpy -pytz -transformers \ No newline at end of file +mastodon.py +matplotlib +pandas +sqlalchemy +vader-multi +langdetect +numpy +pytz +transformers +wheel +germansentiment +scipy From 79f54079f7dabb9e526f41ca28401975d0155215 Mon Sep 17 00:00:00 2001 From: Robert Nasarek Date: Tue, 31 Jan 2023 17:51:06 +0100 Subject: [PATCH 02/10] fixed unrecognisable lang bug --- Main.py | 2 +- TootCrawler.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Main.py b/Main.py index 2b6768a..e6b7a3a 100644 --- a/Main.py +++ b/Main.py @@ -80,7 +80,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) # Line chart. lineChart = dataframe4LineChart.plot.line( ax=axes[1], - title='Mean of all sentiments from max positive (1) to min negative (-1)' + title='"Mean" of all sentiments. Please note that the sentiments are classified in a nominal scale: positive (1), neutral (0), and negative (-1) and NOT with compounds. Therefore this value indicates a tendency and not a correct statistical value.' ) axes[1].grid(True) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) diff --git a/TootCrawler.py b/TootCrawler.py index fa131bf..a657a5d 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -31,7 +31,10 @@ class TootCrawler(): timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) for i in allTimelineResults: content = self.cleanhtml(i.content) - language = detect(content) + try: + language = detect(content) + except: + language = None sentiment = self.sentiTooter.analyze(language, content) toot = { "sentiment": sentiment[0], From 8f7c57808779945a850fb7af9e7e1834dd7d7dfb Mon Sep 17 00:00:00 2001 From: Robert Nasarek Date: Wed, 15 Mar 2023 11:16:35 +0100 Subject: [PATCH 03/10] shortend description --- Main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Main.py b/Main.py index e6b7a3a..5dd227d 100644 --- a/Main.py +++ b/Main.py @@ -80,7 +80,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) # Line chart. lineChart = dataframe4LineChart.plot.line( ax=axes[1], - title='"Mean" of all sentiments. Please note that the sentiments are classified in a nominal scale: positive (1), neutral (0), and negative (-1) and NOT with compounds. Therefore this value indicates a tendency and not a correct statistical value.' + title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!' ) axes[1].grid(True) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) @@ -93,6 +93,6 @@ axes[1].tick_params(which='minor', length=0) plotFileUrl = f'./plots/{TodayDate}.png' plt.savefig(plotFileUrl) -media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") +media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}. Please note that the sentiments are classified in a nominal scale: positive (1), neutral (0), and negative (-1) and NOT with compounds. Therefore the mean indicates a tendency and not a correct statistical value.") mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') From 3b677e5713621639b1131140c48f732a3668b4f0 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Wed, 15 Mar 2023 13:21:44 +0100 Subject: [PATCH 04/10] underway to wordcount --- CRUDManager.py | 12 +++++++++++- Main.py | 8 ++++++-- SentiTooter.py | 17 +++-------------- requirements.txt | 5 ++++- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/CRUDManager.py b/CRUDManager.py index 0226fbf..e18a575 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -3,7 +3,6 @@ import pandas as pd from sqlalchemy import desc, select from Tables import Toots - def calculateSentimentCount(): query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount FROM Toots @@ -32,6 +31,17 @@ def calculateSentimentMean(dataframe): ] ) +def calculateWordCount(): + query = f'''SELECT DATE(datetime) as date, language, sentiment, toot + FROM Toots + WHERE datetime >= DATE("now","-1 day") + AND datetime < DATE("now")''' + return pd.read_sql( + query, + engine, + parse_dates=["datetime"] + ) + class CRUDManager(): def saveToDatabase(self, dataframe, table:str, useIndex=False): diff --git a/Main.py b/Main.py index a64a63c..e43e2ca 100644 --- a/Main.py +++ b/Main.py @@ -1,4 +1,4 @@ -from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, calculateWordCount from datetime import datetime, date from DbSetup import init_db import locale @@ -25,12 +25,16 @@ crudManager = CRUDManager() lastTootId = crudManager.getLastToot() tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) -exit() + if not tootsDataframe.empty: crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) else: print('Nothing changed since last database insert!') +wordCounts = calculateWordCount() +print(wordCounts); +print("exit programm") +exit() sentimentsYesterday = calculateSentimentCount() sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) diff --git a/SentiTooter.py b/SentiTooter.py index 9626078..6aa1f92 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -41,15 +41,15 @@ class SentiTooter: output = self.enModel(**encoded_input) scores = output[0][0].detach().numpy() scores = softmax(scores) - print(scores) + #print(scores) sentimentIndexWithMaxScore = np.argmax(scores) sentimentLabel = self.labels[sentimentIndexWithMaxScore] sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)] - print(sentiment) + #print(sentiment) return sentiment case _: compound = self.sia.polarity_scores(content)['compound'] - print(self.sia.polarity_scores(content), 'vaderSentiment') + #print(self.sia.polarity_scores(content), 'vaderSentiment') if compound > (1 / 3): return ['positive', 'vaderSentiment'] elif compound < (-1 / 3): @@ -58,7 +58,6 @@ class SentiTooter: return ['neutral', 'vaderSentiment'] - def initModel(self): # PT tokenizer = AutoTokenizer.from_pretrained(self.enModelType) @@ -66,13 +65,3 @@ class SentiTooter: model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) model.save_pretrained(self.enModelType) return model, tokenizer - - # # TF - # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL) - # model.save_pretrained(MODEL) - - # text = "Good night 😊" - # encoded_input = tokenizer(text, return_tensors='tf') - # output = model(encoded_input) - # scores = output[0][0].numpy() - # scores = softmax(scores) diff --git a/requirements.txt b/requirements.txt index 842b542..d280535 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,7 @@ sqlalchemy vader-multi numpy pytz -transformers \ No newline at end of file +transformers +langdetect +germansentiment +scipy \ No newline at end of file From 6a8caac29efed1bde02575f6804ec650d3fe02f4 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Wed, 15 Mar 2023 14:27:07 +0100 Subject: [PATCH 05/10] implement rough wordcount --- CRUDManager.py | 8 ++++---- Main.py | 8 ++++++-- SentiTooter.py | 38 +++++++++++++++++++++++++++++++++++--- requirements.txt | 2 ++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/CRUDManager.py b/CRUDManager.py index ea7e7e5..dccdf00 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -31,14 +31,14 @@ def calculateSentimentMean(dataframe): ] ) -def calculateWordCount(): - query = f'''SELECT DATE(datetime) as date, language, sentiment, toot +def getYesterdaysToots(): + query = f'''SELECT datetime as date, language, sentiment, toot FROM Toots WHERE datetime >= DATE("now","-1 day") AND datetime < DATE("now")''' return pd.read_sql( - query, - engine, + sql.text(query), + connection, parse_dates=["datetime"] ) diff --git a/Main.py b/Main.py index 56ba6b7..2af6e60 100644 --- a/Main.py +++ b/Main.py @@ -1,4 +1,4 @@ -from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, calculateWordCount +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots from datetime import datetime, date from DbSetup import init_db import locale @@ -6,6 +6,7 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler +from SentiTooter import translateToots, countWords locale.setlocale(locale.LC_TIME, "en_US.UTF-8") init_db() @@ -31,7 +32,10 @@ if not tootsDataframe.empty: else: print('Nothing changed since last database insert!') -wordCounts = calculateWordCount() +yesterdaysToots = getYesterdaysToots() +translatedToots = translateToots(yesterdaysToots) +tootsSeries = translatedToots.toot +wordCounts = countWords(tootsSeries.str.cat(sep=' '), 10) print(wordCounts); print("exit programm") exit() diff --git a/SentiTooter.py b/SentiTooter.py index 6aa1f92..d5f22ef 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -4,15 +4,17 @@ from scipy.special import softmax from transformers import AutoModelForSequenceClassification from transformers import AutoTokenizer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer - +from deep_translator import GoogleTranslator +import spacy +from collections import Counter # Preprocess text (username and link placeholders) def preprocess(text): new_text = [] for t in text.split(" "): - t = '@user' if t.startswith('@') and len(t) > 1 else t - t = 'http' if t.startswith('http') else t + t = '' if t.startswith('@') and len(t) > 1 else t + t = '' if t.startswith('http') else t new_text.append(t) return " ".join(new_text) @@ -65,3 +67,33 @@ class SentiTooter: model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) model.save_pretrained(self.enModelType) return model, tokenizer + +def translateToots(yesterdaysToots): + yesterdaysTootsTranslated = yesterdaysToots + for index, row in yesterdaysTootsTranslated.iterrows(): + if (row['language'] != 'de'): + try: + yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot']) + yesterdaysTootsTranslated.at[index,'language'] = 'de' + except: + yesterdaysTootsTranslated.drop(index) + return yesterdaysTootsTranslated + +def translateToot(language, toot): + content = preprocess(toot) + return GoogleTranslator(source=language, target='de').translate(content) + +def countWords(concatedToots, count): + nlp = spacy.load('de_core_news_sm') + doc = nlp(concatedToots) + + # noun tokens that arent stop words or punctuations + nouns = [token.text + for token in doc + if (not token.is_stop and + not token.is_punct and + token.pos_ == "NOUN")] + + # five most common noun tokens + noun_freq = Counter(nouns) + return noun_freq.most_common(count) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2cf3aab..bc6906e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ transformers wheel germansentiment scipy +deep_translator +spacy \ No newline at end of file From 4479bd24293143da9a9ea9439c9443afbe1f9997 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Wed, 15 Mar 2023 16:02:47 +0100 Subject: [PATCH 06/10] implement word counts. --- Main.py | 41 ++++++++++++++++++++++++++++------------- SentiTooter.py | 24 +++++++++++++++++++----- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/Main.py b/Main.py index 2af6e60..2e7908f 100644 --- a/Main.py +++ b/Main.py @@ -6,11 +6,12 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler -from SentiTooter import translateToots, countWords +from SentiTooter import translateToots, createWordCountPerSentiment locale.setlocale(locale.LC_TIME, "en_US.UTF-8") init_db() +print('Initialize Mastodon...') mastodonAccountManager = MastodonAccountManager() mastodonInstance = mastodonAccountManager.instance """ @@ -20,34 +21,47 @@ mastodonInstance.log_in( to_file = 'hedonodon_usercred.secret' ) """ +print('done!') +print('Fetching recent toots...') tootCrawler = TootCrawler(mastodonInstance) crudManager = CRUDManager() - lastTootId = crudManager.getLastToot() tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) +print('done!') +print('Save toots to database...') if not tootsDataframe.empty: crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) + print('done!') else: - print('Nothing changed since last database insert!') + print('nothing changed since last database insert!') +print('Calculate word counts...') yesterdaysToots = getYesterdaysToots() translatedToots = translateToots(yesterdaysToots) -tootsSeries = translatedToots.toot -wordCounts = countWords(tootsSeries.str.cat(sep=' '), 10) -print(wordCounts); -print("exit programm") -exit() -sentimentsYesterday = calculateSentimentCount() -sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) +wordCountsPerSentiment = createWordCountPerSentiment(translatedToots) +print('done!') +print(wordCountsPerSentiment); + +print('Calculate sentiment counts...') +sentimentsYesterday = calculateSentimentCount() +print('done!') + +print('Calculate sentiment mean...') +sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) +print('done!') + +print('Save calculations to database...') if not tootsDataframe.empty: crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True) crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) + print('done!') else: - print('Nothing changed since last database insert!') + print('nothing changed since last database insert!') +print('Create figure...') colormap = { 'negative': '#ff9999', 'neutral': '#ffcc99', @@ -100,7 +114,8 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) axes[1].tick_params(which='minor', length=0) plotFileUrl = f'./plots/{TodayDate}.png' plt.savefig(plotFileUrl) +print('done!') -#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") -#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') +media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") +mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') diff --git a/SentiTooter.py b/SentiTooter.py index d5f22ef..28a5623 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -71,20 +71,20 @@ class SentiTooter: def translateToots(yesterdaysToots): yesterdaysTootsTranslated = yesterdaysToots for index, row in yesterdaysTootsTranslated.iterrows(): - if (row['language'] != 'de'): + if (row['language'] != 'en'): try: yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot']) - yesterdaysTootsTranslated.at[index,'language'] = 'de' + yesterdaysTootsTranslated.at[index,'language'] = 'en' except: yesterdaysTootsTranslated.drop(index) return yesterdaysTootsTranslated def translateToot(language, toot): content = preprocess(toot) - return GoogleTranslator(source=language, target='de').translate(content) + return GoogleTranslator(source=language, target='en').translate(content) def countWords(concatedToots, count): - nlp = spacy.load('de_core_news_sm') + nlp = spacy.load('en_core_web_md') doc = nlp(concatedToots) # noun tokens that arent stop words or punctuations @@ -96,4 +96,18 @@ def countWords(concatedToots, count): # five most common noun tokens noun_freq = Counter(nouns) - return noun_freq.most_common(count) \ No newline at end of file + return noun_freq.most_common(count) + +def createWordCountPerSentiment(translatedToots): + sentimentList = [] + for sentiment in ['positive', 'neutral', 'negative']: + tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot + wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5) + countList = [] + for count in wordCounts: + countList.append(str(count[0]) + ' (' + str(count[1]) + ')') + list2String = ', '.join(countList) + sentimentString = sentiment + ': ' + list2String + sentimentList.append(sentimentString) + wordCountsPerSentiments = '\n'.join(sentimentList) + return wordCountsPerSentiments \ No newline at end of file From bc842244c728345977e82f44534b2203809f809a Mon Sep 17 00:00:00 2001 From: rnsrk Date: Fri, 17 Mar 2023 20:06:01 +0100 Subject: [PATCH 07/10] add code documentation --- CRUDManager.py | 73 +++++++++++++++++++++++-- DbSetup.py | 6 ++ Main.py | 27 +++++++-- MastodonAccountManager.py | 2 + SentiTooter.py | 112 +++++++++++++++++++++++++++++++++----- Tables.py | 4 +- TootCrawler.py | 68 +++++++++++++++++++++-- 7 files changed, 261 insertions(+), 31 deletions(-) diff --git a/CRUDManager.py b/CRUDManager.py index dccdf00..cd0c7e0 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -3,7 +3,19 @@ import pandas as pd from sqlalchemy import desc, select, sql from Tables import Toots +from pandas.core.api import ( + DataFrame) + def calculateSentimentCount(): + """Calculates the frequencies of the sentiments. + + Returns + ------- + DataFrame + Containing date (YY-MM-DD), sentiment (positive, neutral, negative), + and sentimentCount. + """ + query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount FROM Toots GROUP BY DATE(datetime), @@ -16,7 +28,18 @@ def calculateSentimentCount(): parse_dates=["datetime"] ) -def calculateSentimentMean(dataframe): +def calculateSentimentMean(dataframe:DataFrame) -> DataFrame: + """Calculates the mean of the sentiments. + + Parameters + ------- + dataframe: DataFrame + + Returns + ------- + Dataframe + Containing date (YY-MM-DD), sentimentsMean. + """ negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() sentimentSum = dataframe['sentimentCount'].sum() @@ -31,7 +54,14 @@ def calculateSentimentMean(dataframe): ] ) -def getYesterdaysToots(): +def getYesterdaysToots() -> DataFrame: + """Fetches yesterdays toots from database. + + Returns + ------- + pd.Dataframe + Containing date (YY-MM-DD), language, sentiment, toot. + """ query = f'''SELECT datetime as date, language, sentiment, toot FROM Toots WHERE datetime >= DATE("now","-1 day") @@ -43,16 +73,49 @@ def getYesterdaysToots(): ) class CRUDManager(): + """Class for database operations""" - def saveToDatabase(self, dataframe, table:str, useIndex=False): + def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False): + """Saves dataframe to database. + + Parameters + ------- + dataframe: DataFrame + Input dataframe. + table: str + Table, where to save the data. + useIndex: boolean + Should the index of the dataframe be used as index for + the database table? + """ try: dataframe.to_sql(table, engine, index=useIndex, if_exists="append") except: print(f'Could not save data to {table}!') - def loadFromDatabase(self, table:str, indexColumn=None): + def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame: + """Load a table into a dataframe. + + Parameters + ------- + table: str + Table, where to save the data. + indexColumn: str | None + Should the index of the table be used as index for + the dataframe? + Returns + ------- + DataFrame + """ return pd.read_sql_table(table, connection, index_col=indexColumn) - def getLastToot(self): + def getLastToot(self) -> str: + """Query the last toot id from database. + + Results + ------- + str + A toot id. + """ stmt = select(Toots.tootId).order_by(desc('datetime')) return session.scalars(stmt).first() diff --git a/DbSetup.py b/DbSetup.py index c787928..b6179f7 100644 --- a/DbSetup.py +++ b/DbSetup.py @@ -1,3 +1,7 @@ +"""Script to initialize the database. + Serves database url, engine, connection and session. +""" + from sqlalchemy import create_engine from sqlalchemy.orm import Session from sqlalchemy.ext.declarative import declarative_base @@ -9,4 +13,6 @@ session = Session(engine) Base = declarative_base() def init_db(): + """Initialize the database. + """ Base.metadata.create_all(bind=engine) diff --git a/Main.py b/Main.py index 2e7908f..5bb6532 100644 --- a/Main.py +++ b/Main.py @@ -1,3 +1,19 @@ +""" +Hedonodon toot sentiment analyzer. + +This programm fetches toots from the fedihum.org Mastodon instance, calculates +the frequencies of the sentiments (positive, neutral, negative) and the mean +from these nominal values (even this is not statistical correct (;-_-)!, but +not all analyzer return compounds). +It also calculates the word count of the nouns per sentiment. + +It uses germansentiment for german toots, twitter-roberta-base-sentiment for +english toots, and vaderSentiment for other languages. + +For the word counts I translate the toots to english with the GoogleTranslator +first. +""" + from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots from datetime import datetime, date from DbSetup import init_db @@ -6,7 +22,7 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler -from SentiTooter import translateToots, createWordCountPerSentiment +from SentiTooter import translateToots, createWordFrequenciesPerSentiment locale.setlocale(locale.LC_TIME, "en_US.UTF-8") init_db() @@ -40,7 +56,7 @@ else: print('Calculate word counts...') yesterdaysToots = getYesterdaysToots() translatedToots = translateToots(yesterdaysToots) -wordCountsPerSentiment = createWordCountPerSentiment(translatedToots) +wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots) print('done!') print(wordCountsPerSentiment); @@ -116,6 +132,7 @@ plotFileUrl = f'./plots/{TodayDate}.png' plt.savefig(plotFileUrl) print('done!') -media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") -mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') - +print('Send toot...') +#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") +#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') +print('done!') diff --git a/MastodonAccountManager.py b/MastodonAccountManager.py index 68d62bb..957ac4e 100644 --- a/MastodonAccountManager.py +++ b/MastodonAccountManager.py @@ -1,5 +1,7 @@ from mastodon import Mastodon class MastodonAccountManager(): + """Initialize the Mastodon account. + """ def __init__(self): self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') diff --git a/SentiTooter.py b/SentiTooter.py index 28a5623..b5c1d6a 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -1,4 +1,5 @@ from germansentiment import SentimentModel +from pandas import DataFrame import numpy as np from scipy.special import softmax from transformers import AutoModelForSequenceClassification @@ -9,7 +10,18 @@ import spacy from collections import Counter # Preprocess text (username and link placeholders) -def preprocess(text): +def preprocess(text:str) -> str: + """Removes tags and urls from text. + + Parameters + ------ + text: str + The raw toot from Mastodon. + Returns + ------ + str + The cleaned text. + """ new_text = [] for t in text.split(" "): @@ -20,9 +32,12 @@ def preprocess(text): class SentiTooter: - """""" + """Class to analyze the toots. + """ def __init__(self): + """Initilize the sentiment models and labels. + """ self.deModel = SentimentModel() self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" self.enModel, self.enTokenizer = self.initModel() @@ -30,7 +45,20 @@ class SentiTooter: self.labels = ['negative', 'neutral', 'positive'] self.sia = SentimentIntensityAnalyzer() - def analyze(self, language, content): + def analyze(self, language:str, content:str) -> list[str, str, float]: + """Analyzes the sentiments of the toots. + + Parameters + ------ + language: str + The language tag of the toot. + content: str + The toot content. + Returns + ------ + list[str, str, float] + A list with the sentiment, analyzer type, and sentiment score. + """ match language: case 'de': sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True) @@ -61,6 +89,13 @@ class SentiTooter: def initModel(self): + """Initialize the english models. + + Returns + ------ + tupel + The pretrained model and tokenizer. + """ # PT tokenizer = AutoTokenizer.from_pretrained(self.enModelType) tokenizer.save_pretrained(self.enModelType) @@ -68,7 +103,14 @@ class SentiTooter: model.save_pretrained(self.enModelType) return model, tokenizer -def translateToots(yesterdaysToots): +def translateToots(yesterdaysToots:DataFrame) -> DataFrame: + """Translates all toots to english. + + Returns + ------ + Dataframe + Containing the english translated toots. + """ yesterdaysTootsTranslated = yesterdaysToots for index, row in yesterdaysTootsTranslated.iterrows(): if (row['language'] != 'en'): @@ -79,11 +121,39 @@ def translateToots(yesterdaysToots): yesterdaysTootsTranslated.drop(index) return yesterdaysTootsTranslated -def translateToot(language, toot): +def translateToot(language:str, toot:str) -> str: + """Translate a toot in english. + + Parameters + ------ + language:str + The language of the toot. + toot: str + The toot content. + + Returns + ------ + str + The in english translated toot. + """ content = preprocess(toot) return GoogleTranslator(source=language, target='en').translate(content) -def countWords(concatedToots, count): +def countWords(concatedToots: str, number: int) -> list: + """Counts the word frequencies in all toots of a given sentiment. + + Parameters + ------ + concatedToots: str + All toots from a sentiment. + number: int + Number of words to calculate word frequencies. + + Returns + ------ + list + List containing tuple of word and word frequency. + """ nlp = spacy.load('en_core_web_md') doc = nlp(concatedToots) @@ -96,18 +166,30 @@ def countWords(concatedToots, count): # five most common noun tokens noun_freq = Counter(nouns) - return noun_freq.most_common(count) + return noun_freq.most_common(number) -def createWordCountPerSentiment(translatedToots): +def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str: + """Count all word frequencies of all toots per sentiment. + + Paramters + ------ + translatedToots: DataFrame + The dataframe with all toots in english. + + Returns + ------ + str + Containing words and wourd counts per sentiment. + """ sentimentList = [] for sentiment in ['positive', 'neutral', 'negative']: tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot - wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5) - countList = [] - for count in wordCounts: - countList.append(str(count[0]) + ' (' + str(count[1]) + ')') - list2String = ', '.join(countList) + wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5) + FrequenciesList = [] + for Frequencies in wordFrequencies: + FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')') + list2String = ', '.join(FrequenciesList) sentimentString = sentiment + ': ' + list2String sentimentList.append(sentimentString) - wordCountsPerSentiments = '\n'.join(sentimentList) - return wordCountsPerSentiments \ No newline at end of file + wordFrequenciessPerSentiments = '\n'.join(sentimentList) + return wordFrequenciessPerSentiments \ No newline at end of file diff --git a/Tables.py b/Tables.py index c64178a..071179a 100644 --- a/Tables.py +++ b/Tables.py @@ -1,3 +1,5 @@ +"""This script containing the table definitions for the database.""" + from DbSetup import Base from sqlalchemy import Column, Date, Integer, Float, String @@ -14,8 +16,6 @@ class Toots(Base): userName = Column(String(255)) userId = Column(String(255)) - - class SentimentCounts(Base): __tablename__ = 'SentimentCounts' __table_args__ = {'extend_existing': True} diff --git a/TootCrawler.py b/TootCrawler.py index a657a5d..67cd722 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -1,27 +1,87 @@ from langdetect import detect import pytz import pandas as pd +from pandas import DataFrame import re from SentiTooter import SentiTooter from pprint import pprint class TootCrawler(): + """Class to fetch the recent toots from fedihum.org.""" - def __init__(self, mastodonInstance) -> None: + def __init__(self, mastodonInstance: any) -> None: + """Initialize the Mastodon instance and depending classes. + + Parameters + ------ + mastodonInstance: any + The initialized Mastodon instance. + """ self.mastodonInstance = mastodonInstance self.compilePattern = re.compile('<.*?>') self.sentiTooter = SentiTooter() self.localTimezone = pytz.timezone('Europe/Berlin') - def getLocalTimeline(self, minId=None): + def getLocalTimeline(self, minId=None) -> any: + """Receave the local timeline + + Parameters + ------ + minId: str | None + The last fetched toot id from the database. + + Returns + ------ + any + The local Mastodon timeline from fedihum.org. + """ return self.mastodonInstance.timeline_local(min_id=minId, limit=500) - def cleanhtml(self, raw_html): + def cleanhtml(self, raw_html:str) -> str: + """remove brackets and http string from toots + + Parameters + ------ + raw_html: str + The toot content. + Returns + ------ + str: + The cleaned toot content. + """ cleantext = re.sub(self.compilePattern, '', raw_html) cleantext = re.sub(r'http\S+', '', cleantext) return cleantext - def buildTootsDataframe(self, minId=None): + def buildTootsDataframe(self, minId=None) -> DataFrame: + """Parse fetched toots from Mastodon to dataframe. + + Parameters + ------ + minId: str | None + The id of the last fetched toot. + + Returns + ------ + DataFrame + A Dataframe containing + sentiment: str + The sentiment (positive, neutral, negative) + model: str + The used sentiment model. + toot: str + The content of the toot. + datetime: datetime + The datetime of the toot. + language: str + The langage flag of the toot. + userName: str. + The user name of the toot. + userId: str + The user id. + tootId: str + The toot id. + """ toots = [] allTimelineResults = [] timelinePagination = self.getLocalTimeline(minId) From 8d9a7fa603e898eff857a35d06fa8690dec29ed4 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Fri, 17 Mar 2023 21:25:44 +0100 Subject: [PATCH 08/10] take the large spacy model --- SentiTooter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SentiTooter.py b/SentiTooter.py index b5c1d6a..aabc498 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -154,7 +154,7 @@ def countWords(concatedToots: str, number: int) -> list: list List containing tuple of word and word frequency. """ - nlp = spacy.load('en_core_web_md') + nlp = spacy.load('en_core_web_lg') doc = nlp(concatedToots) # noun tokens that arent stop words or punctuations From cafda77e7fdea42ff2577b2a00c2c0d7ac818b11 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Fri, 17 Mar 2023 21:26:14 +0100 Subject: [PATCH 09/10] Updated the README --- README.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5440dc1..468a872 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,19 @@ # Hedonodon -I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds. +## Prerequisites +Install the dependencies with `python -m pip install -r requirements.txt`. +Install SpaCys nlp model with `python -m spacy download en_core_web_lg`. +If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fail, go to the model pages on hugging face (see models section) and download the to the respective folder (cardiffnlp/twitter-roberta-base-sentiment) -More Documentation coming soon! \ No newline at end of file +## Purpose +Hedonodon fetched toots from fedihum.org and calculates the sentiments, sentiment mean and word frequencies of each day, and creates fancy diagrams from the data. + +## Motivation +This tool was created to understand how sentiment analyses and nlp methods works, so it may lacks of proper use of models etc... + +## Models +It uses "germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for german toots, []"twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for +english toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages. +For the word counts I translate the toots to english with the GoogleTranslator from [deep_translater](https://pypi.org/project/deep-translator/) first and then use SpaCys nlp model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies. + +## Weaknesses +Since some moduls do not return sentiment compounds I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statisticaly not okay (;-_-). \ No newline at end of file From 03792f21204df2d17a031a6fcd4ea95ba2423d51 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Fri, 17 Mar 2023 21:29:16 +0100 Subject: [PATCH 10/10] Fixed some typos --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 468a872..58d02b6 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,10 @@ ## Prerequisites Install the dependencies with `python -m pip install -r requirements.txt`. Install SpaCys nlp model with `python -m spacy download en_core_web_lg`. -If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fail, go to the model pages on hugging face (see models section) and download the to the respective folder (cardiffnlp/twitter-roberta-base-sentiment) +If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fail, go to the model pages on hugging face (see models section) and download the to the respective folder (cardiffnlp/twitter-roberta-base-sentiment). ## Purpose -Hedonodon fetched toots from fedihum.org and calculates the sentiments, sentiment mean and word frequencies of each day, and creates fancy diagrams from the data. +Hedonodon fetch toots from fedihum.org and calculates the sentiments, sentiment mean and word frequencies of each day, and creates fancy diagrams from the data. ## Motivation This tool was created to understand how sentiment analyses and nlp methods works, so it may lacks of proper use of models etc...