Now using language-dependent sentiment analyzers. No compound score anymore.

This commit is contained in:
rnsrk 2023-01-05 01:43:11 +01:00
parent f0d4eadf28
commit a20f7331bb
8 changed files with 153 additions and 72 deletions

4
.gitignore vendored
View file

@ -5,3 +5,7 @@ __pycache__
hedonodon_clientcred.secret hedonodon_clientcred.secret
hedonodon_usercred.secret hedonodon_usercred.secret
.fleet .fleet
test.py
.idea
cardiffnlp
venv

View file

@ -3,6 +3,35 @@ import pandas as pd
from sqlalchemy import desc, select from sqlalchemy import desc, select
from Tables import Toots from Tables import Toots
def calculateSentimentCount():
    """Count the toots of the previous calendar day per sentiment label.

    Runs one aggregate query against the ``Toots`` table via the module-level
    ``databaseUrl``.

    Returns:
        pandas.DataFrame with the columns ``date``, ``sentiment`` and
        ``sentimentCount``; one row per (date, sentiment) group.
    """
    # The day window is applied with WHERE, i.e. before grouping. Filtering in
    # HAVING on the bare ``datetime`` column (which is not part of GROUP BY)
    # only works because SQLite picks an arbitrary row of each group; it is
    # not valid standard SQL and forces every row to be grouped first.
    # String literals use single quotes; double quotes denote identifiers.
    # NOTE(review): DATE('now', ...) is SQLite syntax - confirm the backend.
    query = '''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
    FROM Toots
    WHERE datetime >= DATE('now', '-1 day')
    AND datetime < DATE('now')
    GROUP BY DATE(datetime),
    sentiment'''
    # No parse_dates: the result set has no ``datetime`` column, and ``date``
    # is deliberately left as returned by the database.
    return pd.read_sql(query, databaseUrl)
def calculateSentimentMean(dataframe):
    """Reduce per-sentiment counts to a single mean score in [-1, 1].

    Every ``positive`` toot counts as +1, every ``negative`` toot as -1 and
    every other label as 0; the sum is divided by the total number of toots.

    Args:
        dataframe: Frame with the columns ``date``, ``sentiment`` and
            ``sentimentCount``.

    Returns:
        pandas.DataFrame with the columns ``date`` and ``sentimentsMean``.
        It holds one row (the date is taken from the first input row), or no
        rows when the input is empty or its counts sum to zero.
    """
    resultColumns = ['date', 'sentimentsMean']

    # Nothing to average: return an empty frame instead of failing on the
    # first-row lookup or dividing by zero.
    if dataframe.empty:
        return pd.DataFrame(columns=resultColumns)

    counts = dataframe['sentimentCount']
    totalCount = counts.sum()
    if totalCount == 0:
        return pd.DataFrame(columns=resultColumns)

    positiveCount = counts[dataframe['sentiment'] == 'positive'].sum()
    negativeCount = counts[dataframe['sentiment'] == 'negative'].sum()
    sentimentMean = (positiveCount - negativeCount) / totalCount

    # Positional access: the first row is wanted regardless of index labels.
    sentimentDate = dataframe['date'].iloc[0]

    return pd.DataFrame.from_records(
        [
            {
                'date': sentimentDate,
                'sentimentsMean': sentimentMean,
            }
        ]
    )
class CRUDManager(): class CRUDManager():
def saveToDatabase(self, dataframe, table:str, useIndex=False): def saveToDatabase(self, dataframe, table:str, useIndex=False):
@ -17,20 +46,3 @@ class CRUDManager():
def getLastToot(self): def getLastToot(self):
stmt = select(Toots.tootId).order_by(desc('datetime')) stmt = select(Toots.tootId).order_by(desc('datetime'))
return session.scalars(stmt).first() return session.scalars(stmt).first()
def calculateAggregates(self, column, aggregate='Count'):
if (aggregate=='Count'):
addGroup = f', {column} '
else:
addGroup = ''
query = f'''SELECT DATE(datetime) as date {addGroup}, {aggregate}({column}) as {column}{aggregate}
FROM Toots
GROUP BY DATE(datetime)''' \
+ addGroup \
+ '''HAVING datetime >= DATE("now","-1 day")
AND datetime < DATE("now")'''
return pd.read_sql(
query,
databaseUrl,
parse_dates=["datetime"]
)

52
Main.py
View file

@ -1,12 +1,10 @@
from CRUDManager import CRUDManager from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
from datetime import datetime, date from datetime import datetime, date
from DbSetup import init_db from DbSetup import init_db
import locale import locale
from MastodonAccountManager import MastodonAccountManager from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.dates as mdates import matplotlib.dates as mdates
from matplotlib.ticker import MultipleLocator
import numpy as np
from TootCrawler import TootCrawler from TootCrawler import TootCrawler
locale.setlocale(locale.LC_TIME, "en_EN.UTF-8") locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
@ -27,31 +25,38 @@ crudManager = CRUDManager()
lastTootId = crudManager.getLastToot() lastTootId = crudManager.getLastToot()
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
sentimentsYesterday = crudManager.calculateAggregates('sentiment', 'Count')
if not tootsDataframe.empty:
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
else:
print('Nothing changed since last database insert!')
sentimentsYesterday = calculateSentimentCount()
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
if not tootsDataframe.empty:
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
else:
print('Nothing changed since last database insert!')
colormap = { colormap = {
'negative"': '#ff9999', 'negative': '#ff9999',
'neutral': '#ffcc99', 'neutral': '#ffcc99',
"positive": '#99ff99' "positive": '#99ff99'
} }
todaysColors = [] todaysColors = []
for sentiment in sentimentsYesterday['sentiment'].to_numpy(): for sentiment in sentimentsYesterday['sentiment'].to_numpy():
todaysColors.append(colormap[sentiment]) todaysColors.append(colormap[sentiment])
compoundsYesterday = crudManager.calculateAggregates('compound', 'Avg')
if not tootsDataframe.empty:
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='Sentiments', useIndex=True)
crudManager.saveToDatabase(dataframe=compoundsYesterday, table='Compounds', useIndex=True)
else:
print('Nothing changed since last database insert!')
TodayDate= datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment') dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment')
dataframe4LineChart = crudManager.loadFromDatabase('Compounds', 'date').drop('index', axis=1) dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1)
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10)) fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
# Pie chart. # Pie chart.
pieChartlabels = dataframe4PieChart.index.to_numpy() pieChartlabels = dataframe4PieChart.index.to_numpy()
@ -61,24 +66,22 @@ pieChart = dataframe4PieChart.plot.pie(
ylabel="", ylabel="",
labels=dataframe4PieChart['sentimentCount'], labels=dataframe4PieChart['sentimentCount'],
title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org', title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org',
colors = todaysColors, colors=todaysColors,
wedgeprops=dict(linewidth=3, edgecolor='w'), wedgeprops=dict(linewidth=3, edgecolor='w'),
startangle=90 startangle=90
) )
axes[0].axis('equal') axes[0].axis('equal')
centre_circle = plt.Circle((0,0),0.6,fc='white') centre_circle = plt.Circle((0, 0), 0.6, fc='white')
axes[0].add_patch(centre_circle) axes[0].add_patch(centre_circle)
chartBox = axes[0].get_position() chartBox = axes[0].get_position()
axes[0].set_position([chartBox.x0,chartBox.y0-0.2,chartBox.width,chartBox.height]) axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
axes[0].legend(pieChartlabels,loc='upper right', bbox_to_anchor=(0.8, 0.9))
# Line chart. # Line chart.
lineChart = dataframe4LineChart.plot.line( lineChart = dataframe4LineChart.plot.line(
ax=axes[1], ax=axes[1],
title='Compounds from max positive (1) to min negative (-1)' title='Mean of all sentiments from max positive (1) to min negative (-1)'
) )
axes[1].grid(True) axes[1].grid(True)
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
axes[1].set_ylim([-1, 1]) axes[1].set_ylim([-1, 1])
@ -88,8 +91,9 @@ axes[1].xaxis.set_major_formatter(plt.NullFormatter())
axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
axes[1].tick_params(which='minor', length=0) axes[1].tick_params(which='minor', length=0)
plotFileUrl = f'./plots/{TodayDate}.png' plotFileUrl = f'./plots/{TodayDate}.png'
plt.show()
plt.savefig(plotFileUrl) plt.savefig(plotFileUrl)
"""
media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.") media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.")
mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
"""

View file

@ -2,4 +2,4 @@ from mastodon import Mastodon
class MastodonAccountManager(): class MastodonAccountManager():
def __init__(self): def __init__(self):
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')

View file

@ -1,19 +1,74 @@
from math import sqrt from germansentiment import SentimentModel
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
class SentiTooter():
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in ``text`` with fixed placeholders.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``; every other token is kept. The tokens are
    joined with single spaces again, so the original spacing is preserved.
    """
    def _placeholder(token):
        # A lone "@" is not treated as a mention.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))
class SentiTooter:
"""""" """"""
def __init__(self): def __init__(self):
self.deModel = SentimentModel()
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
self.enModel, self.enTokenizer = self.initModel()
# https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
self.labels = ['negative', 'neutral', 'positive']
self.sia = SentimentIntensityAnalyzer() self.sia = SentimentIntensityAnalyzer()
def analyze(self, toot): def analyze(self, toot):
compound = self.sia.polarity_scores(toot.content)['compound'] match toot.language:
if (compound > (1/3)): case 'de':
return ['positive', compound] sentiment = self.deModel.predict_sentiment([toot.content])
elif (compound < (-1/3)): sentiment.append('germanSentiment')
return ['negative', compound] return sentiment
else: case 'en':
return ['neutral', compound] text = preprocess(toot.content)
encoded_input = self.enTokenizer(text, return_tensors='pt')
output = self.enModel(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
sentimentIndexWithMaxScore = np.argmax(scores)
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment']
return sentiment
case _:
compound = self.sia.polarity_scores(toot.content)['compound']
if compound > (1 / 3):
return ['positive', 'vaderSentiment']
elif compound < (-1 / 3):
return ['negative', 'vaderSentiment']
else:
return ['neutral', 'vaderSentiment']
def initModel(self):
    """Load the model and tokenizer named by ``self.enModelType`` and save copies.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # PyTorch variant.
    # Load both parts before writing anything: save_pretrained() creates a
    # local directory whose path equals the model id. If the tokenizer were
    # saved first, the following model lookup could resolve to that directory
    # while it still lacks config and weights.
    # NOTE(review): local-directory precedence as described in the
    # transformers from_pretrained docs - confirm for the pinned version.
    tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
    model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)

    tokenizer.save_pretrained(self.enModelType)
    model.save_pretrained(self.enModelType)
    return model, tokenizer
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

View file

@ -5,7 +5,7 @@ class Toots(Base):
__tablename__ = 'Toots' __tablename__ = 'Toots'
__table_args__ = {'extend_existing': True} __table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True) index = Column(Integer, primary_key=True)
compound = Column(Float) model = Column(String(30))
datetime = Column(Date) datetime = Column(Date)
language = Column(String(3)) language = Column(String(3))
sentiment = Column(String(8)) sentiment = Column(String(8))
@ -16,18 +16,17 @@ class Toots(Base):
class Sentiments(Base): class SentimentCounts(Base):
__tablename__ = 'Sentiments' __tablename__ = 'SentimentCounts'
__table_args__ = {'extend_existing': True} __table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True) index = Column(Integer, primary_key=True)
sentimentCount = Column(Integer) sentimentCount = Column(Integer)
date = Column(Date, primary_key = True) date = Column(Date, primary_key=True)
sentiment = Column(String(8)) sentiment = Column(String(8))
class SentimentMeans(Base):
class Compounds(Base): __tablename__ = 'SentimentMeans'
__tablename__ = 'Compounds'
__table_args__ = {'extend_existing': True} __table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True) index = Column(Integer, primary_key=True)
date = Column(Date, primary_key = True) date = Column(Date, primary_key=True)
compoundAvg = Column(Float) SentimentsMean = Column(Float)

View file

@ -1,10 +1,10 @@
from langdetect import detect
import pytz import pytz
import pandas as pd import pandas as pd
import re import re
from SentiTooter import SentiTooter from SentiTooter import SentiTooter
from pprint import pprint from pprint import pprint
class TootCrawler(): class TootCrawler():
def __init__(self, mastodonInstance) -> None: def __init__(self, mastodonInstance) -> None:
@ -13,29 +13,34 @@ class TootCrawler():
self.sentiTooter = SentiTooter() self.sentiTooter = SentiTooter()
self.localTimezone = pytz.timezone('Europe/Berlin') self.localTimezone = pytz.timezone('Europe/Berlin')
def getLocalTimeline(self, sinceId=None): def getLocalTimeline(self, minId=None):
return self.mastodonInstance.timeline_local(since_id=sinceId) return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
def cleanhtml(self, raw_html): def cleanhtml(self, raw_html):
cleantext = re.sub(self.compilePattern, '', raw_html) cleantext = re.sub(self.compilePattern, '', raw_html)
cleantext = re.sub(r'http\S+', '', cleantext) cleantext = re.sub(r'http\S+', '', cleantext)
return cleantext return cleantext
def buildTootsDataframe(self, sinceId=None): def buildTootsDataframe(self, minId=None):
toots = [] toots = []
allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId)
for i in self.getLocalTimeline(sinceId): while timelinePagination:
allTimelineResults = allTimelineResults + timelinePagination
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
for i in allTimelineResults:
content = self.cleanhtml(i.content) content = self.cleanhtml(i.content)
sentiment = self.sentiTooter.analyze(i) sentiment = self.sentiTooter.analyze(i)
toots.append( toots.append(
{ {
"sentiment": sentiment[0], "sentiment": sentiment[0],
"compound": sentiment[1], "model": sentiment[1],
"userName": i.account.display_name, "userName": i.account.display_name,
"userId": i.account.id, "userId": i.account.id,
"toot": content, "toot": content,
"datetime": i.created_at.astimezone(self.localTimezone), "datetime": i.created_at.astimezone(self.localTimezone),
"language": i.language, "language": detect(content),
"tootId": i.id "tootId": i.id
} }
) )

View file

@ -4,3 +4,5 @@ pandas
sqlalchemy sqlalchemy
vader-multi vader-multi
numpy numpy
pytz
transformers