diff --git a/.gitignore b/.gitignore index 22d268b..d946327 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,8 @@ instance __pycache__ hedonodon_clientcred.secret hedonodon_usercred.secret -.fleet \ No newline at end of file +.fleet +test.py +.idea +cardiffnlp +venv \ No newline at end of file diff --git a/CRUDManager.py b/CRUDManager.py index 4211035..0226fbf 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -3,6 +3,35 @@ import pandas as pd from sqlalchemy import desc, select from Tables import Toots + +def calculateSentimentCount(): + query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount + FROM Toots + GROUP BY DATE(datetime), + sentiment + HAVING datetime >= DATE("now","-1 day") + AND datetime < DATE("now")''' + return pd.read_sql( + query, + databaseUrl, + parse_dates=["datetime"] + ) + +def calculateSentimentMean(dataframe): + negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 + positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() + sentimentSum = dataframe['sentimentCount'].sum() + sentimentMean = (negativeSentimentSum + positiveSentimentSum) / sentimentSum + sentimentDate = dataframe.loc[0]['date'] + return pd.DataFrame.from_records( + [ + { + 'date': sentimentDate, + 'sentimentsMean': sentimentMean + } + ] + ) + class CRUDManager(): def saveToDatabase(self, dataframe, table:str, useIndex=False): @@ -16,21 +45,4 @@ class CRUDManager(): def getLastToot(self): stmt = select(Toots.tootId).order_by(desc('datetime')) - return session.scalars(stmt).first() - - def calculateAggregates(self, column, aggregate='Count'): - if (aggregate=='Count'): - addGroup = f', {column} ' - else: - addGroup = '' - query = f'''SELECT DATE(datetime) as date {addGroup}, {aggregate}({column}) as {column}{aggregate} - FROM Toots - GROUP BY DATE(datetime)''' \ - + addGroup \ - + '''HAVING datetime >= DATE("now","-1 day") - AND datetime < DATE("now")''' - return pd.read_sql( - query, - databaseUrl, - parse_dates=["datetime"] - ) + return session.scalars(stmt).first() \ No newline at end of file diff --git a/Main.py b/Main.py index cd5ee0d..879e487 100644 --- a/Main.py +++ b/Main.py @@ -1,12 +1,10 @@ -from CRUDManager import CRUDManager +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean from datetime import datetime, date from DbSetup import init_db import locale from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates -from matplotlib.ticker import MultipleLocator -import numpy as np from TootCrawler import TootCrawler locale.setlocale(locale.LC_TIME, "en_EN.UTF-8") @@ -27,31 +25,38 @@ crudManager = CRUDManager() lastTootId = crudManager.getLastToot() tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) -sentimentsYesterday = crudManager.calculateAggregates('sentiment', 'Count') + +if not tootsDataframe.empty: + crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) +else: + print('Nothing changed since last database insert!') + +sentimentsYesterday = calculateSentimentCount() +sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) + +if not tootsDataframe.empty: + crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True) + crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) +else: + print('Nothing changed since last database insert!') colormap = { - 'negative"': '#ff9999', + 'negative': '#ff9999', 'neutral': '#ffcc99', "positive": '#99ff99' } todaysColors = [] for sentiment in sentimentsYesterday['sentiment'].to_numpy(): - todaysColors.append(colormap[sentiment]) + todaysColors.append(colormap[sentiment]) -compoundsYesterday = crudManager.calculateAggregates('compound', 'Avg') -if not tootsDataframe.empty: - crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) - crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='Sentiments', useIndex=True) - crudManager.saveToDatabase(dataframe=compoundsYesterday, table='Compounds', useIndex=True) -else: - print('Nothing changed since last database insert!') -TodayDate= datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y') + +TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y') dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment') -dataframe4LineChart = crudManager.loadFromDatabase('Compounds', 'date').drop('index', axis=1) +dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1) -fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10)) +fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10)) # Pie chart. pieChartlabels = dataframe4PieChart.index.to_numpy() @@ -61,24 +66,22 @@ pieChart = dataframe4PieChart.plot.pie( ylabel="", labels=dataframe4PieChart['sentimentCount'], title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org', - colors = todaysColors, + colors=todaysColors, wedgeprops=dict(linewidth=3, edgecolor='w'), startangle=90 ) axes[0].axis('equal') -centre_circle = plt.Circle((0,0),0.6,fc='white') +centre_circle = plt.Circle((0, 0), 0.6, fc='white') axes[0].add_patch(centre_circle) chartBox = axes[0].get_position() -axes[0].set_position([chartBox.x0,chartBox.y0-0.2,chartBox.width,chartBox.height]) -axes[0].legend(pieChartlabels,loc='upper right', bbox_to_anchor=(0.8, 0.9)) - +axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) # Line chart. lineChart = dataframe4LineChart.plot.line( ax=axes[1], - title='Compounds from max positive (1) to min negative (-1)' - ) + title='Mean of all sentiments from max positive (1) to min negative (-1)' +) axes[1].grid(True) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) axes[1].set_ylim([-1, 1]) @@ -88,8 +91,9 @@ axes[1].xaxis.set_major_formatter(plt.NullFormatter()) axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) axes[1].tick_params(which='minor', length=0) plotFileUrl = f'./plots/{TodayDate}.png' -plt.show() plt.savefig(plotFileUrl) +""" media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.") mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') +""" diff --git a/MastodonAccountManager.py b/MastodonAccountManager.py index 6542659..9c51e54 100644 --- a/MastodonAccountManager.py +++ b/MastodonAccountManager.py @@ -2,4 +2,4 @@ from mastodon import Mastodon class MastodonAccountManager(): def __init__(self): - self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') + self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') diff --git a/SentiTooter.py b/SentiTooter.py index b745c2f..1d89ab6 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -1,19 +1,74 @@ -from math import sqrt -from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer +from germansentiment import SentimentModel import numpy as np +from scipy.special import softmax +from transformers import AutoModelForSequenceClassification +from transformers import AutoTokenizer +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -class SentiTooter(): + +# Preprocess text (username and link placeholders) +def preprocess(text): + new_text = [] + + for t in text.split(" "): + t = '@user' if t.startswith('@') and len(t) > 1 else t + t = 'http' if t.startswith('http') else t + new_text.append(t) + return " ".join(new_text) + + +class SentiTooter: """""" + def __init__(self): + self.deModel = SentimentModel() + self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" + self.enModel, self.enTokenizer = self.initModel() + # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt + self.labels = ['negative', 'neutral', 'positive'] self.sia = SentimentIntensityAnalyzer() - def analyze(self, toot): - compound = self.sia.polarity_scores(toot.content)['compound'] - if (compound > (1/3)): - return ['positive', compound] - elif (compound < (-1/3)): - return ['negative', compound] - else: - return ['neutral', compound] + match toot.language: + case 'de': + sentiment = self.deModel.predict_sentiment([toot.content]) + sentiment.append('germanSentiment') + return sentiment + case 'en': + text = preprocess(toot.content) + encoded_input = self.enTokenizer(text, return_tensors='pt') + output = self.enModel(**encoded_input) + scores = output[0][0].detach().numpy() + scores = softmax(scores) + sentimentIndexWithMaxScore = np.argmax(scores) + sentimentLabel = self.labels[sentimentIndexWithMaxScore] + sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment'] + return sentiment + case _: + compound = self.sia.polarity_scores(toot.content)['compound'] + if compound > (1 / 3): + return ['positive', 'vaderSentiment'] + elif compound < (-1 / 3): + return ['negative', 'vaderSentiment'] + else: + return ['neutral', 'vaderSentiment'] + + + def initModel(self): + # PT + tokenizer = AutoTokenizer.from_pretrained(self.enModelType) + tokenizer.save_pretrained(self.enModelType) + model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) + model.save_pretrained(self.enModelType) + return model, tokenizer + + # # TF + # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL) + # model.save_pretrained(MODEL) + + # text = "Good night 😊" + # encoded_input = tokenizer(text, return_tensors='tf') + # output = model(encoded_input) + # scores = output[0][0].numpy() + # scores = softmax(scores) diff --git a/Tables.py b/Tables.py index 4ae477f..78aa412 100644 --- a/Tables.py +++ b/Tables.py @@ -5,7 +5,7 @@ class Toots(Base): __tablename__ = 'Toots' __table_args__ = {'extend_existing': True} index = Column(Integer, primary_key=True) - compound = Column(Float) + model = Column(String(30)) datetime = Column(Date) language = Column(String(3)) sentiment = Column(String(8)) @@ -16,18 +16,17 @@ class Toots(Base): -class Sentiments(Base): - __tablename__ = 'Sentiments' +class SentimentCounts(Base): + __tablename__ = 'SentimentCounts' __table_args__ = {'extend_existing': True} index = Column(Integer, primary_key=True) sentimentCount = Column(Integer) - date = Column(Date, primary_key = True) + date = Column(Date, primary_key=True) sentiment = Column(String(8)) - -class Compounds(Base): - __tablename__ = 'Compounds' +class SentimentMeans(Base): + __tablename__ = 'SentimentMeans' __table_args__ = {'extend_existing': True} index = Column(Integer, primary_key=True) - date = Column(Date, primary_key = True) - compoundAvg = Column(Float) \ No newline at end of file + date = Column(Date, primary_key=True) + SentimentsMean = Column(Float) \ No newline at end of file diff --git a/TootCrawler.py b/TootCrawler.py index 47b87f0..5d51b75 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -1,10 +1,10 @@ +from langdetect import detect import pytz import pandas as pd import re from SentiTooter import SentiTooter from pprint import pprint - class TootCrawler(): def __init__(self, mastodonInstance) -> None: @@ -13,29 +13,34 @@ class TootCrawler(): self.sentiTooter = SentiTooter() self.localTimezone = pytz.timezone('Europe/Berlin') - def getLocalTimeline(self, sinceId=None): - return self.mastodonInstance.timeline_local(since_id=sinceId) + def getLocalTimeline(self, minId=None): + return self.mastodonInstance.timeline_local(min_id=minId, limit=500) def cleanhtml(self, raw_html): cleantext = re.sub(self.compilePattern, '', raw_html) cleantext = re.sub(r'http\S+', '', cleantext) return cleantext - def buildTootsDataframe(self, sinceId=None): + def buildTootsDataframe(self, minId=None): toots = [] + allTimelineResults = [] + timelinePagination = self.getLocalTimeline(minId) - for i in self.getLocalTimeline(sinceId): + while timelinePagination: + allTimelineResults = allTimelineResults + timelinePagination + timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) + for i in allTimelineResults: content = self.cleanhtml(i.content) sentiment = self.sentiTooter.analyze(i) toots.append( { "sentiment": sentiment[0], - "compound": sentiment[1], + "model": sentiment[1], "userName": i.account.display_name, "userId": i.account.id, "toot": content, "datetime": i.created_at.astimezone(self.localTimezone), - "language": i.language, + "language": detect(content), "tootId": i.id } ) diff --git a/requirements.txt b/requirements.txt index 4880a8c..842b542 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ matplotlib pandas sqlalchemy vader-multi -numpy \ No newline at end of file +numpy +pytz +transformers \ No newline at end of file