Now using a language-dependent sentiment analyzer. No compound score anymore.
This commit is contained in:
parent
f0d4eadf28
commit
a20f7331bb
8 changed files with 153 additions and 72 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -5,3 +5,7 @@ __pycache__
|
||||||
hedonodon_clientcred.secret
|
hedonodon_clientcred.secret
|
||||||
hedonodon_usercred.secret
|
hedonodon_usercred.secret
|
||||||
.fleet
|
.fleet
|
||||||
|
test.py
|
||||||
|
.idea
|
||||||
|
cardiffnlp
|
||||||
|
venv
|
||||||
|
|
@ -3,6 +3,35 @@ import pandas as pd
|
||||||
from sqlalchemy import desc, select
|
from sqlalchemy import desc, select
|
||||||
from Tables import Toots
|
from Tables import Toots
|
||||||
|
|
||||||
|
|
||||||
|
def calculateSentimentCount():
    """Count the toots of the previous day, grouped by sentiment label.

    Queries the ``Toots`` table for rows whose ``datetime`` lies in the
    half-open window ``[DATE('now', '-1 day'), DATE('now'))`` as evaluated
    by the database, and groups them by calendar date and sentiment.

    Returns:
        pandas.DataFrame: one row per (date, sentiment) pair with the
        columns ``date``, ``sentiment`` and ``sentimentCount``.
    """
    # The date window is a per-row condition, so it belongs in WHERE (applied
    # before grouping). The previous HAVING clause referenced the bare,
    # non-aggregated ``datetime`` column, which standard SQL rejects and which
    # forces the database to aggregate the whole table first.
    # String literals use single quotes; double quotes denote identifiers.
    query = '''SELECT DATE(datetime) AS date, sentiment, COUNT(sentiment) AS sentimentCount
               FROM Toots
               WHERE datetime >= DATE('now', '-1 day')
                 AND datetime < DATE('now')
               GROUP BY DATE(datetime), sentiment'''
    # NOTE(review): the result set has no ``datetime`` column, so this
    # ``parse_dates`` entry looks like a no-op. It is left untouched on purpose:
    # Main.py parses ``date`` with ``datetime.strptime``, i.e. expects a string.
    return pd.read_sql(
        query,
        databaseUrl,
        parse_dates=["datetime"]
    )
|
||||||
|
|
||||||
|
def calculateSentimentMean(dataframe):
    """Collapse per-sentiment counts into one mean score between -1 and 1.

    Each 'positive' toot counts as +1, each 'negative' toot as -1 and every
    other label as 0; the sum is divided by the total number of toots.

    Args:
        dataframe: frame with the columns ``date``, ``sentiment`` and
            ``sentimentCount``.

    Returns:
        pandas.DataFrame: a single row with the columns ``date`` (taken from
        the first input row) and ``sentimentsMean``. For an empty input an
        empty frame with the same two columns is returned.
    """
    # Nothing to average: avoid the failing row lookup / 0-by-0 division below.
    if dataframe.empty:
        return pd.DataFrame(columns=['date', 'sentimentsMean'])

    counts = dataframe['sentimentCount']
    labels = dataframe['sentiment']
    positiveSentimentSum = counts[labels == 'positive'].sum()
    negativeSentimentSum = counts[labels == 'negative'].sum()
    sentimentMean = (positiveSentimentSum - negativeSentimentSum) / counts.sum()

    # Positional access: the first row is wanted regardless of how the frame
    # is indexed (a label lookup of 0 fails on filtered or re-indexed frames).
    sentimentDate = dataframe['date'].iloc[0]

    # NOTE(review): the key is spelled 'sentimentsMean' while the
    # SentimentMeans table declares 'SentimentsMean' - confirm which is intended.
    return pd.DataFrame.from_records(
        [
            {
                'date': sentimentDate,
                'sentimentsMean': sentimentMean
            }
        ]
    )
|
||||||
|
|
||||||
class CRUDManager():
|
class CRUDManager():
|
||||||
|
|
||||||
def saveToDatabase(self, dataframe, table:str, useIndex=False):
|
def saveToDatabase(self, dataframe, table:str, useIndex=False):
|
||||||
|
|
@ -17,20 +46,3 @@ class CRUDManager():
|
||||||
def getLastToot(self):
|
def getLastToot(self):
|
||||||
stmt = select(Toots.tootId).order_by(desc('datetime'))
|
stmt = select(Toots.tootId).order_by(desc('datetime'))
|
||||||
return session.scalars(stmt).first()
|
return session.scalars(stmt).first()
|
||||||
|
|
||||||
def calculateAggregates(self, column, aggregate='Count'):
|
|
||||||
if (aggregate=='Count'):
|
|
||||||
addGroup = f', {column} '
|
|
||||||
else:
|
|
||||||
addGroup = ''
|
|
||||||
query = f'''SELECT DATE(datetime) as date {addGroup}, {aggregate}({column}) as {column}{aggregate}
|
|
||||||
FROM Toots
|
|
||||||
GROUP BY DATE(datetime)''' \
|
|
||||||
+ addGroup \
|
|
||||||
+ '''HAVING datetime >= DATE("now","-1 day")
|
|
||||||
AND datetime < DATE("now")'''
|
|
||||||
return pd.read_sql(
|
|
||||||
query,
|
|
||||||
databaseUrl,
|
|
||||||
parse_dates=["datetime"]
|
|
||||||
)
|
|
||||||
|
|
|
||||||
50
Main.py
50
Main.py
|
|
@ -1,12 +1,10 @@
|
||||||
from CRUDManager import CRUDManager
|
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from DbSetup import init_db
|
from DbSetup import init_db
|
||||||
import locale
|
import locale
|
||||||
from MastodonAccountManager import MastodonAccountManager
|
from MastodonAccountManager import MastodonAccountManager
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import matplotlib.dates as mdates
|
import matplotlib.dates as mdates
|
||||||
from matplotlib.ticker import MultipleLocator
|
|
||||||
import numpy as np
|
|
||||||
from TootCrawler import TootCrawler
|
from TootCrawler import TootCrawler
|
||||||
|
|
||||||
locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
|
locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
|
||||||
|
|
@ -27,10 +25,23 @@ crudManager = CRUDManager()
|
||||||
|
|
||||||
lastTootId = crudManager.getLastToot()
|
lastTootId = crudManager.getLastToot()
|
||||||
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
|
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
|
||||||
sentimentsYesterday = crudManager.calculateAggregates('sentiment', 'Count')
|
|
||||||
|
if not tootsDataframe.empty:
|
||||||
|
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
|
||||||
|
else:
|
||||||
|
print('Nothing changed since last database insert!')
|
||||||
|
|
||||||
|
sentimentsYesterday = calculateSentimentCount()
|
||||||
|
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
|
||||||
|
|
||||||
|
if not tootsDataframe.empty:
|
||||||
|
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
|
||||||
|
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
|
||||||
|
else:
|
||||||
|
print('Nothing changed since last database insert!')
|
||||||
|
|
||||||
colormap = {
|
colormap = {
|
||||||
'negative"': '#ff9999',
|
'negative': '#ff9999',
|
||||||
'neutral': '#ffcc99',
|
'neutral': '#ffcc99',
|
||||||
"positive": '#99ff99'
|
"positive": '#99ff99'
|
||||||
}
|
}
|
||||||
|
|
@ -39,19 +50,13 @@ todaysColors = []
|
||||||
for sentiment in sentimentsYesterday['sentiment'].to_numpy():
|
for sentiment in sentimentsYesterday['sentiment'].to_numpy():
|
||||||
todaysColors.append(colormap[sentiment])
|
todaysColors.append(colormap[sentiment])
|
||||||
|
|
||||||
compoundsYesterday = crudManager.calculateAggregates('compound', 'Avg')
|
|
||||||
if not tootsDataframe.empty:
|
|
||||||
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
|
|
||||||
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='Sentiments', useIndex=True)
|
|
||||||
crudManager.saveToDatabase(dataframe=compoundsYesterday, table='Compounds', useIndex=True)
|
|
||||||
else:
|
|
||||||
print('Nothing changed since last database insert!')
|
|
||||||
|
|
||||||
TodayDate= datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
|
|
||||||
|
TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
|
||||||
dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment')
|
dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment')
|
||||||
dataframe4LineChart = crudManager.loadFromDatabase('Compounds', 'date').drop('index', axis=1)
|
dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1)
|
||||||
|
|
||||||
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10))
|
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
|
||||||
|
|
||||||
# Pie chart.
|
# Pie chart.
|
||||||
pieChartlabels = dataframe4PieChart.index.to_numpy()
|
pieChartlabels = dataframe4PieChart.index.to_numpy()
|
||||||
|
|
@ -61,24 +66,22 @@ pieChart = dataframe4PieChart.plot.pie(
|
||||||
ylabel="",
|
ylabel="",
|
||||||
labels=dataframe4PieChart['sentimentCount'],
|
labels=dataframe4PieChart['sentimentCount'],
|
||||||
title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org',
|
title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org',
|
||||||
colors = todaysColors,
|
colors=todaysColors,
|
||||||
wedgeprops=dict(linewidth=3, edgecolor='w'),
|
wedgeprops=dict(linewidth=3, edgecolor='w'),
|
||||||
startangle=90
|
startangle=90
|
||||||
)
|
)
|
||||||
|
|
||||||
axes[0].axis('equal')
|
axes[0].axis('equal')
|
||||||
centre_circle = plt.Circle((0,0),0.6,fc='white')
|
centre_circle = plt.Circle((0, 0), 0.6, fc='white')
|
||||||
axes[0].add_patch(centre_circle)
|
axes[0].add_patch(centre_circle)
|
||||||
chartBox = axes[0].get_position()
|
chartBox = axes[0].get_position()
|
||||||
axes[0].set_position([chartBox.x0,chartBox.y0-0.2,chartBox.width,chartBox.height])
|
axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
|
||||||
axes[0].legend(pieChartlabels,loc='upper right', bbox_to_anchor=(0.8, 0.9))
|
|
||||||
|
|
||||||
|
|
||||||
# Line chart.
|
# Line chart.
|
||||||
lineChart = dataframe4LineChart.plot.line(
|
lineChart = dataframe4LineChart.plot.line(
|
||||||
ax=axes[1],
|
ax=axes[1],
|
||||||
title='Compounds from max positive (1) to min negative (-1)'
|
title='Mean of all sentiments from max positive (1) to min negative (-1)'
|
||||||
)
|
)
|
||||||
axes[1].grid(True)
|
axes[1].grid(True)
|
||||||
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
|
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
|
||||||
axes[1].set_ylim([-1, 1])
|
axes[1].set_ylim([-1, 1])
|
||||||
|
|
@ -88,8 +91,9 @@ axes[1].xaxis.set_major_formatter(plt.NullFormatter())
|
||||||
axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
|
axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
|
||||||
axes[1].tick_params(which='minor', length=0)
|
axes[1].tick_params(which='minor', length=0)
|
||||||
plotFileUrl = f'./plots/{TodayDate}.png'
|
plotFileUrl = f'./plots/{TodayDate}.png'
|
||||||
plt.show()
|
|
||||||
plt.savefig(plotFileUrl)
|
plt.savefig(plotFileUrl)
|
||||||
|
|
||||||
|
"""
|
||||||
media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.")
|
media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.")
|
||||||
mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
|
mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
|
||||||
|
"""
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,74 @@
|
||||||
from math import sqrt
|
from germansentiment import SentimentModel
|
||||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from scipy.special import softmax
|
||||||
|
from transformers import AutoModelForSequenceClassification
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
|
|
||||||
class SentiTooter():
|
|
||||||
|
# Preprocess text (username and link placeholders)
|
||||||
|
def preprocess(text):
    """Replace user mentions and links in ``text`` with generic placeholders.

    Every whitespace-delimited token that starts with ``@`` and is longer
    than one character becomes ``@user``; every token that starts with
    ``http`` becomes ``http``. All other tokens and the original whitespace
    are returned unchanged.

    Args:
        text: the string to normalize.

    Returns:
        str: ``text`` with mentions and links replaced.
    """
    import re  # local import: this module does not import ``re`` at file level

    def placeholder(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    # Splitting on a *captured* whitespace group keeps the separators in the
    # result, so joining restores the exact spacing. Unlike split(" "), tokens
    # that follow a newline or tab are recognized as mentions/links too.
    return ''.join(placeholder(part) for part in re.split(r'(\s+)', text))
|
||||||
|
|
||||||
|
|
||||||
|
class SentiTooter:
|
||||||
""""""
|
""""""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
self.deModel = SentimentModel()
|
||||||
|
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
|
||||||
|
self.enModel, self.enTokenizer = self.initModel()
|
||||||
|
# https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
|
||||||
|
self.labels = ['negative', 'neutral', 'positive']
|
||||||
self.sia = SentimentIntensityAnalyzer()
|
self.sia = SentimentIntensityAnalyzer()
|
||||||
|
|
||||||
|
|
||||||
def analyze(self, toot):
|
def analyze(self, toot):
|
||||||
|
match toot.language:
|
||||||
|
case 'de':
|
||||||
|
sentiment = self.deModel.predict_sentiment([toot.content])
|
||||||
|
sentiment.append('germanSentiment')
|
||||||
|
return sentiment
|
||||||
|
case 'en':
|
||||||
|
text = preprocess(toot.content)
|
||||||
|
encoded_input = self.enTokenizer(text, return_tensors='pt')
|
||||||
|
output = self.enModel(**encoded_input)
|
||||||
|
scores = output[0][0].detach().numpy()
|
||||||
|
scores = softmax(scores)
|
||||||
|
sentimentIndexWithMaxScore = np.argmax(scores)
|
||||||
|
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
|
||||||
|
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment']
|
||||||
|
return sentiment
|
||||||
|
case _:
|
||||||
compound = self.sia.polarity_scores(toot.content)['compound']
|
compound = self.sia.polarity_scores(toot.content)['compound']
|
||||||
if (compound > (1/3)):
|
if compound > (1 / 3):
|
||||||
return ['positive', compound]
|
return ['positive', 'vaderSentiment']
|
||||||
elif (compound < (-1/3)):
|
elif compound < (-1 / 3):
|
||||||
return ['negative', compound]
|
return ['negative', 'vaderSentiment']
|
||||||
else:
|
else:
|
||||||
return ['neutral', compound]
|
return ['neutral', 'vaderSentiment']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def initModel(self):
    """Load the English sentiment model and its tokenizer.

    Both objects are obtained with ``from_pretrained(self.enModelType)`` and
    immediately written back with ``save_pretrained`` to a path equal to
    ``self.enModelType``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # PT
    modelLocation = self.enModelType

    enTokenizer = AutoTokenizer.from_pretrained(modelLocation)
    enTokenizer.save_pretrained(modelLocation)

    enModel = AutoModelForSequenceClassification.from_pretrained(modelLocation)
    enModel.save_pretrained(modelLocation)

    return enModel, enTokenizer
|
||||||
|
|
||||||
|
# # TF
|
||||||
|
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
|
||||||
|
# model.save_pretrained(MODEL)
|
||||||
|
|
||||||
|
# text = "Good night 😊"
|
||||||
|
# encoded_input = tokenizer(text, return_tensors='tf')
|
||||||
|
# output = model(encoded_input)
|
||||||
|
# scores = output[0][0].numpy()
|
||||||
|
# scores = softmax(scores)
|
||||||
|
|
|
||||||
17
Tables.py
17
Tables.py
|
|
@ -5,7 +5,7 @@ class Toots(Base):
|
||||||
__tablename__ = 'Toots'
|
__tablename__ = 'Toots'
|
||||||
__table_args__ = {'extend_existing': True}
|
__table_args__ = {'extend_existing': True}
|
||||||
index = Column(Integer, primary_key=True)
|
index = Column(Integer, primary_key=True)
|
||||||
compound = Column(Float)
|
model = Column(String(30))
|
||||||
datetime = Column(Date)
|
datetime = Column(Date)
|
||||||
language = Column(String(3))
|
language = Column(String(3))
|
||||||
sentiment = Column(String(8))
|
sentiment = Column(String(8))
|
||||||
|
|
@ -16,18 +16,17 @@ class Toots(Base):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Sentiments(Base):
|
class SentimentCounts(Base):
|
||||||
__tablename__ = 'Sentiments'
|
__tablename__ = 'SentimentCounts'
|
||||||
__table_args__ = {'extend_existing': True}
|
__table_args__ = {'extend_existing': True}
|
||||||
index = Column(Integer, primary_key=True)
|
index = Column(Integer, primary_key=True)
|
||||||
sentimentCount = Column(Integer)
|
sentimentCount = Column(Integer)
|
||||||
date = Column(Date, primary_key = True)
|
date = Column(Date, primary_key=True)
|
||||||
sentiment = Column(String(8))
|
sentiment = Column(String(8))
|
||||||
|
|
||||||
|
class SentimentMeans(Base):
|
||||||
class Compounds(Base):
|
__tablename__ = 'SentimentMeans'
|
||||||
__tablename__ = 'Compounds'
|
|
||||||
__table_args__ = {'extend_existing': True}
|
__table_args__ = {'extend_existing': True}
|
||||||
index = Column(Integer, primary_key=True)
|
index = Column(Integer, primary_key=True)
|
||||||
date = Column(Date, primary_key = True)
|
date = Column(Date, primary_key=True)
|
||||||
compoundAvg = Column(Float)
|
SentimentsMean = Column(Float)
|
||||||
|
|
@ -1,10 +1,10 @@
|
||||||
|
from langdetect import detect
|
||||||
import pytz
|
import pytz
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
import re
|
||||||
from SentiTooter import SentiTooter
|
from SentiTooter import SentiTooter
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
|
|
||||||
|
|
||||||
class TootCrawler():
|
class TootCrawler():
|
||||||
|
|
||||||
def __init__(self, mastodonInstance) -> None:
|
def __init__(self, mastodonInstance) -> None:
|
||||||
|
|
@ -13,29 +13,34 @@ class TootCrawler():
|
||||||
self.sentiTooter = SentiTooter()
|
self.sentiTooter = SentiTooter()
|
||||||
self.localTimezone = pytz.timezone('Europe/Berlin')
|
self.localTimezone = pytz.timezone('Europe/Berlin')
|
||||||
|
|
||||||
def getLocalTimeline(self, sinceId=None):
|
def getLocalTimeline(self, minId=None):
|
||||||
return self.mastodonInstance.timeline_local(since_id=sinceId)
|
return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
|
||||||
|
|
||||||
def cleanhtml(self, raw_html):
|
def cleanhtml(self, raw_html):
|
||||||
cleantext = re.sub(self.compilePattern, '', raw_html)
|
cleantext = re.sub(self.compilePattern, '', raw_html)
|
||||||
cleantext = re.sub(r'http\S+', '', cleantext)
|
cleantext = re.sub(r'http\S+', '', cleantext)
|
||||||
return cleantext
|
return cleantext
|
||||||
|
|
||||||
def buildTootsDataframe(self, sinceId=None):
|
def buildTootsDataframe(self, minId=None):
|
||||||
toots = []
|
toots = []
|
||||||
|
allTimelineResults = []
|
||||||
|
timelinePagination = self.getLocalTimeline(minId)
|
||||||
|
|
||||||
for i in self.getLocalTimeline(sinceId):
|
while timelinePagination:
|
||||||
|
allTimelineResults = allTimelineResults + timelinePagination
|
||||||
|
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
|
||||||
|
for i in allTimelineResults:
|
||||||
content = self.cleanhtml(i.content)
|
content = self.cleanhtml(i.content)
|
||||||
sentiment = self.sentiTooter.analyze(i)
|
sentiment = self.sentiTooter.analyze(i)
|
||||||
toots.append(
|
toots.append(
|
||||||
{
|
{
|
||||||
"sentiment": sentiment[0],
|
"sentiment": sentiment[0],
|
||||||
"compound": sentiment[1],
|
"model": sentiment[1],
|
||||||
"userName": i.account.display_name,
|
"userName": i.account.display_name,
|
||||||
"userId": i.account.id,
|
"userId": i.account.id,
|
||||||
"toot": content,
|
"toot": content,
|
||||||
"datetime": i.created_at.astimezone(self.localTimezone),
|
"datetime": i.created_at.astimezone(self.localTimezone),
|
||||||
"language": i.language,
|
"language": detect(content),
|
||||||
"tootId": i.id
|
"tootId": i.id
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -4,3 +4,5 @@ pandas
|
||||||
sqlalchemy
|
sqlalchemy
|
||||||
vader-multi
|
vader-multi
|
||||||
numpy
|
numpy
|
||||||
|
pytz
|
||||||
|
transformers
|
||||||
Loading…
Add table
Add a link
Reference in a new issue