Now using language-dependent sentiment analyzers. No compound score anymore.

This commit is contained in:
rnsrk 2023-01-05 01:43:11 +01:00
parent f0d4eadf28
commit a20f7331bb
8 changed files with 153 additions and 72 deletions

6
.gitignore vendored
View file

@ -4,4 +4,8 @@ instance
__pycache__
hedonodon_clientcred.secret
hedonodon_usercred.secret
.fleet
.fleet
test.py
.idea
cardiffnlp
venv

View file

@ -3,6 +3,35 @@ import pandas as pd
from sqlalchemy import desc, select
from Tables import Toots
def calculateSentimentCount():
    """Count the previous day's toots per sentiment label.

    Queries the ``Toots`` table for rows whose ``datetime`` lies between
    ``DATE('now', '-1 day')`` (inclusive) and ``DATE('now')`` (exclusive),
    as evaluated by the database, and groups them by day and sentiment.

    Returns:
        A dataframe with the columns ``date``, ``sentiment`` and
        ``sentimentCount``.
    """
    # The date filter belongs in WHERE, before grouping: ``datetime`` is
    # neither grouped nor aggregated, so testing it in HAVING depended on
    # the database picking an arbitrary row of each group. SQL string
    # literals are written with single quotes.
    query = """SELECT DATE(datetime) AS date, sentiment, COUNT(sentiment) AS sentimentCount
        FROM Toots
        WHERE datetime >= DATE('now', '-1 day')
        AND datetime < DATE('now')
        GROUP BY DATE(datetime), sentiment"""
    return pd.read_sql(
        query,
        databaseUrl,
        # NOTE(review): the result set has no "datetime" column, so this
        # presumably has no effect. It is kept unchanged because Main.py
        # parses ``date`` with strptime and therefore needs it as a string.
        parse_dates=["datetime"],
    )
def calculateSentimentMean(dataframe):
    """Reduce per-sentiment counts to a single mean score.

    Positive toots count as +1, negative toots as -1 and every other label
    as 0, so the result is ``(positive - negative) / total``.

    Args:
        dataframe: Dataframe with the columns ``date``, ``sentiment`` and
            ``sentimentCount``.

    Returns:
        A one-row dataframe with the columns ``date`` (taken from the first
        input row) and ``sentimentsMean``.

    Raises:
        ValueError: If ``dataframe`` has no rows.
    """
    if dataframe.empty:
        raise ValueError('Cannot calculate a sentiment mean from an empty dataframe.')

    counts = dataframe['sentimentCount']
    positiveSentimentSum = counts[dataframe['sentiment'] == 'positive'].sum()
    negativeSentimentSum = counts[dataframe['sentiment'] == 'negative'].sum()
    sentimentMean = (positiveSentimentSum - negativeSentimentSum) / counts.sum()

    # Positional access: the first row is wanted regardless of how the
    # dataframe happens to be indexed (``.loc[0]`` needs an index label 0).
    sentimentDate = dataframe['date'].iloc[0]

    return pd.DataFrame.from_records(
        [
            {
                'date': sentimentDate,
                'sentimentsMean': sentimentMean,
            }
        ]
    )
class CRUDManager():
def saveToDatabase(self, dataframe, table:str, useIndex=False):
@ -16,21 +45,4 @@ class CRUDManager():
def getLastToot(self):
stmt = select(Toots.tootId).order_by(desc('datetime'))
return session.scalars(stmt).first()
def calculateAggregates(self, column, aggregate='Count'):
if (aggregate=='Count'):
addGroup = f', {column} '
else:
addGroup = ''
query = f'''SELECT DATE(datetime) as date {addGroup}, {aggregate}({column}) as {column}{aggregate}
FROM Toots
GROUP BY DATE(datetime)''' \
+ addGroup \
+ '''HAVING datetime >= DATE("now","-1 day")
AND datetime < DATE("now")'''
return pd.read_sql(
query,
databaseUrl,
parse_dates=["datetime"]
)
return session.scalars(stmt).first()

52
Main.py
View file

@ -1,12 +1,10 @@
from CRUDManager import CRUDManager
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
from datetime import datetime, date
from DbSetup import init_db
import locale
from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import MultipleLocator
import numpy as np
from TootCrawler import TootCrawler
locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
@ -27,31 +25,38 @@ crudManager = CRUDManager()
lastTootId = crudManager.getLastToot()
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
sentimentsYesterday = crudManager.calculateAggregates('sentiment', 'Count')
if not tootsDataframe.empty:
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
else:
print('Nothing changed since last database insert!')
sentimentsYesterday = calculateSentimentCount()
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
if not tootsDataframe.empty:
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
else:
print('Nothing changed since last database insert!')
colormap = {
'negative"': '#ff9999',
'negative': '#ff9999',
'neutral': '#ffcc99',
"positive": '#99ff99'
}
todaysColors = []
for sentiment in sentimentsYesterday['sentiment'].to_numpy():
todaysColors.append(colormap[sentiment])
todaysColors.append(colormap[sentiment])
compoundsYesterday = crudManager.calculateAggregates('compound', 'Avg')
if not tootsDataframe.empty:
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='Sentiments', useIndex=True)
crudManager.saveToDatabase(dataframe=compoundsYesterday, table='Compounds', useIndex=True)
else:
print('Nothing changed since last database insert!')
TodayDate= datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment')
dataframe4LineChart = crudManager.loadFromDatabase('Compounds', 'date').drop('index', axis=1)
dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1)
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10))
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
# Pie chart.
pieChartlabels = dataframe4PieChart.index.to_numpy()
@ -61,24 +66,22 @@ pieChart = dataframe4PieChart.plot.pie(
ylabel="",
labels=dataframe4PieChart['sentimentCount'],
title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org',
colors = todaysColors,
colors=todaysColors,
wedgeprops=dict(linewidth=3, edgecolor='w'),
startangle=90
)
axes[0].axis('equal')
centre_circle = plt.Circle((0,0),0.6,fc='white')
centre_circle = plt.Circle((0, 0), 0.6, fc='white')
axes[0].add_patch(centre_circle)
chartBox = axes[0].get_position()
axes[0].set_position([chartBox.x0,chartBox.y0-0.2,chartBox.width,chartBox.height])
axes[0].legend(pieChartlabels,loc='upper right', bbox_to_anchor=(0.8, 0.9))
axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
# Line chart.
lineChart = dataframe4LineChart.plot.line(
ax=axes[1],
title='Compounds from max positive (1) to min negative (-1)'
)
title='Mean of all sentiments from max positive (1) to min negative (-1)'
)
axes[1].grid(True)
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
axes[1].set_ylim([-1, 1])
@ -88,8 +91,9 @@ axes[1].xaxis.set_major_formatter(plt.NullFormatter())
axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
axes[1].tick_params(which='minor', length=0)
plotFileUrl = f'./plots/{TodayDate}.png'
plt.show()
plt.savefig(plotFileUrl)
"""
media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.")
mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
"""

View file

@ -2,4 +2,4 @@ from mastodon import Mastodon
class MastodonAccountManager():
def __init__(self):
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')

View file

@ -1,19 +1,74 @@
from math import sqrt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from germansentiment import SentimentModel
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
class SentiTooter():
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in *text* with fixed placeholders.

    The text is split on single spaces. A token starting with ``@`` that is
    longer than one character becomes ``@user``; a token starting with
    ``http`` becomes ``http``; every other token is kept as is.
    """
    def _placeholder(token):
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))
class SentiTooter:
""""""
def __init__(self):
    """Create the analyzers used by ``analyze``: German, English and a fallback."""
    # Order of the English model's output classes, see
    # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
    self.labels = ['negative', 'neutral', 'positive']
    # Must be set before initModel() is called, which reads it.
    self.enModelType = "cardiffnlp/twitter-roberta-base-sentiment"

    self.deModel = SentimentModel()
    self.enModel, self.enTokenizer = self.initModel()
    self.sia = SentimentIntensityAnalyzer()
def analyze(self, toot):
compound = self.sia.polarity_scores(toot.content)['compound']
if (compound > (1/3)):
return ['positive', compound]
elif (compound < (-1/3)):
return ['negative', compound]
else:
return ['neutral', compound]
match toot.language:
case 'de':
sentiment = self.deModel.predict_sentiment([toot.content])
sentiment.append('germanSentiment')
return sentiment
case 'en':
text = preprocess(toot.content)
encoded_input = self.enTokenizer(text, return_tensors='pt')
output = self.enModel(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
sentimentIndexWithMaxScore = np.argmax(scores)
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment']
return sentiment
case _:
compound = self.sia.polarity_scores(toot.content)['compound']
if compound > (1 / 3):
return ['positive', 'vaderSentiment']
elif compound < (-1 / 3):
return ['negative', 'vaderSentiment']
else:
return ['neutral', 'vaderSentiment']
def initModel(self):
    """Load the English model and tokenizer named by ``self.enModelType``.

    Both are loaded with ``from_pretrained`` and then written back with
    ``save_pretrained`` under the same name.

    Returns:
        The tuple ``(model, tokenizer)``.
    """
    modelName = self.enModelType

    # PT
    enTokenizer = AutoTokenizer.from_pretrained(modelName)
    enTokenizer.save_pretrained(modelName)

    enModel = AutoModelForSequenceClassification.from_pretrained(modelName)
    enModel.save_pretrained(modelName)

    return enModel, enTokenizer
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

View file

@ -5,7 +5,7 @@ class Toots(Base):
__tablename__ = 'Toots'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
compound = Column(Float)
model = Column(String(30))
datetime = Column(Date)
language = Column(String(3))
sentiment = Column(String(8))
@ -16,18 +16,17 @@ class Toots(Base):
class Sentiments(Base):
__tablename__ = 'Sentiments'
class SentimentCounts(Base):
__tablename__ = 'SentimentCounts'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
sentimentCount = Column(Integer)
date = Column(Date, primary_key = True)
date = Column(Date, primary_key=True)
sentiment = Column(String(8))
class Compounds(Base):
__tablename__ = 'Compounds'
class SentimentMeans(Base):
__tablename__ = 'SentimentMeans'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
date = Column(Date, primary_key = True)
compoundAvg = Column(Float)
date = Column(Date, primary_key=True)
SentimentsMean = Column(Float)

View file

@ -1,10 +1,10 @@
from langdetect import detect
import pytz
import pandas as pd
import re
from SentiTooter import SentiTooter
from pprint import pprint
class TootCrawler():
def __init__(self, mastodonInstance) -> None:
@ -13,29 +13,34 @@ class TootCrawler():
self.sentiTooter = SentiTooter()
self.localTimezone = pytz.timezone('Europe/Berlin')
def getLocalTimeline(self, sinceId=None):
return self.mastodonInstance.timeline_local(since_id=sinceId)
def getLocalTimeline(self, minId=None):
    """Request the local timeline from ``self.mastodonInstance``.

    ``minId`` is forwarded as ``min_id`` and the page size is requested as
    500. NOTE(review): the server may cap ``limit`` below 500; confirm
    against the instance's API limits.
    """
    requestOptions = {'min_id': minId, 'limit': 500}
    return self.mastodonInstance.timeline_local(**requestOptions)
def cleanhtml(self, raw_html):
    """Return *raw_html* with markup and links removed.

    Everything matching ``self.compilePattern`` is deleted first, then every
    run of non-whitespace characters starting with ``http``.
    """
    cleantext = raw_html
    for pattern in (self.compilePattern, r'http\S+'):
        cleantext = re.sub(pattern, '', cleantext)
    return cleantext
def buildTootsDataframe(self, sinceId=None):
def buildTootsDataframe(self, minId=None):
toots = []
allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId)
for i in self.getLocalTimeline(sinceId):
while timelinePagination:
allTimelineResults = allTimelineResults + timelinePagination
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
for i in allTimelineResults:
content = self.cleanhtml(i.content)
sentiment = self.sentiTooter.analyze(i)
toots.append(
{
"sentiment": sentiment[0],
"compound": sentiment[1],
"model": sentiment[1],
"userName": i.account.display_name,
"userId": i.account.id,
"toot": content,
"datetime": i.created_at.astimezone(self.localTimezone),
"language": i.language,
"language": detect(content),
"tootId": i.id
}
)

View file

@ -3,4 +3,6 @@ matplotlib
pandas
sqlalchemy
vader-multi
numpy
numpy
pytz
transformers