Compare commits

..

11 commits

Author SHA1 Message Date
rnsrk
03792f2120 Fixed some typos 2023-03-17 21:29:16 +01:00
rnsrk
cafda77e7f Updated the README 2023-03-17 21:26:14 +01:00
rnsrk
8d9a7fa603 take the large spacy model 2023-03-17 21:25:44 +01:00
rnsrk
bc842244c7 add code documentation 2023-03-17 20:06:01 +01:00
rnsrk
4479bd2429 implement word counts. 2023-03-15 16:02:47 +01:00
rnsrk
6a8caac29e implement rough wordcount 2023-03-15 14:27:07 +01:00
rnsrk
09fd313a89 Merge branch 'main' into with_cites 2023-03-15 13:25:41 +01:00
rnsrk
3b677e5713 underway to wordcount 2023-03-15 13:21:44 +01:00
Robert Nasarek
8f7c578087 shortend description 2023-03-15 11:16:35 +01:00
Robert Nasarek
79f54079f7 fixed unrecognisable lang bug 2023-01-31 17:51:06 +01:00
Robert Nasarek
2b98565444 made hedonodon server ready 2023-01-27 21:08:25 +01:00
10 changed files with 473 additions and 149 deletions

23
.gitignore vendored
View file

@ -1,11 +1,12 @@
database.db
plots
instance
__pycache__
hedonodon_clientcred.secret
hedonodon_usercred.secret
.fleet
test.py
.idea
cardiffnlp
venv
database.db
plots
instance
__pycache__
hedonodon_clientcred.secret
hedonodon_usercred.secret
.fleet
test.py
.idea
cardiffnlp
venv
logs.txt

View file

@ -1,10 +1,21 @@
from DbSetup import engine, session, databaseUrl
from DbSetup import connection, engine, session, databaseUrl
import pandas as pd
from sqlalchemy import desc, select
from sqlalchemy import desc, select, sql
from Tables import Toots
from pandas.core.api import (
DataFrame)
def calculateSentimentCount():
"""Calculates the frequencies of the sentiments.
Returns
-------
DataFrame
Containing date (YY-MM-DD), sentiment (positive, neutral, negative),
and sentimentCount.
"""
query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
FROM Toots
GROUP BY DATE(datetime),
@ -12,12 +23,23 @@ def calculateSentimentCount():
HAVING datetime >= DATE("now","-1 day")
AND datetime < DATE("now")'''
return pd.read_sql(
query,
databaseUrl,
sql.text(query),
connection,
parse_dates=["datetime"]
)
def calculateSentimentMean(dataframe):
def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
"""Calculates the mean of the sentiments.
Parameters
-------
dataframe: DataFrame
Returns
-------
Dataframe
Containing date (YY-MM-DD), sentimentsMean.
"""
negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
sentimentSum = dataframe['sentimentCount'].sum()
@ -32,17 +54,68 @@ def calculateSentimentMean(dataframe):
]
)
class CRUDManager():
def getYesterdaysToots() -> DataFrame:
    """Fetch yesterday's toots from the database.

    Returns
    -------
    DataFrame
        Containing date, language, sentiment, toot.
    """
    # Half-open window [yesterday, today), evaluated by SQLite itself.
    # NOTE(review): SQLite's DATE("now") is UTC while the crawler stores
    # Europe/Berlin timestamps - confirm the window is the intended one.
    query = '''SELECT datetime as date, language, sentiment, toot
    FROM Toots
    WHERE datetime >= DATE("now","-1 day")
    AND datetime < DATE("now")'''
    return pd.read_sql(
        sql.text(query),
        connection,
        # The SELECT aliases the column to "date"; asking pandas to parse a
        # non-existing "datetime" column silently parsed nothing.
        parse_dates=["date"]
    )
class CRUDManager():
    """Class for database operations."""

    def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
        """Append a dataframe to a database table.

        Saving is best effort: a failure is reported on stdout and the
        program continues.

        Parameters
        -------
        dataframe: DataFrame
            Input dataframe.
        table: str
            Table, where to save the data.
        useIndex: boolean
            Should the index of the dataframe be used as index for
            the database table?
        """
        try:
            dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
        except Exception as error:
            # Keep the run alive, but say why the insert failed instead of
            # swallowing the reason (and KeyboardInterrupt) with a bare except.
            print(f'Could not save data to {table}! ({error})')

    def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
        """Load a table into a dataframe.

        Parameters
        -------
        table: str
            Table to read the data from.
        indexColumn: str | None
            Column of the table to use as index of the dataframe.

        Returns
        -------
        DataFrame
        """
        return pd.read_sql_table(table, connection, index_col=indexColumn)

    def getLastToot(self) -> str | None:
        """Query the id of the most recent toot from the database.

        Returns
        -------
        str | None
            A toot id, or None when no toot is stored yet.
        """
        stmt = select(Toots.tootId).order_by(desc('datetime'))
        return session.scalars(stmt).first()

View file

@ -1,11 +1,18 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base
databaseUrl = 'sqlite:///database.db'
engine = create_engine(databaseUrl, future=True)
session = Session(engine)
Base = declarative_base()
def init_db():
Base.metadata.create_all(bind=engine)
"""Script to initialize the database.
Serves database url, engine, connection and session.
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from sqlalchemy.ext.declarative import declarative_base
databaseUrl = 'sqlite:///database.db'
engine = create_engine(databaseUrl, future=True)
connection = engine.connect()
session = Session(engine)
Base = declarative_base()
def init_db():
"""Initialize the database.
"""
Base.metadata.create_all(bind=engine)

60
Main.py
View file

@ -1,4 +1,20 @@
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
"""
Hedonodon toot sentiment analyzer.
This program fetches toots from the fedihum.org Mastodon instance, calculates
the frequencies of the sentiments (positive, neutral, negative) and the mean
from these nominal values (even though this is not statistically correct (;-_-)!, but
not all analyzers return compounds).
It also calculates the word count of the nouns per sentiment.
It uses germansentiment for german toots, twitter-roberta-base-sentiment for
english toots, and vaderSentiment for other languages.
For the word counts I translate the toots to english with the GoogleTranslator
first.
"""
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
from datetime import datetime, date
from DbSetup import init_db
import locale
@ -6,10 +22,12 @@ from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from TootCrawler import TootCrawler
from SentiTooter import translateToots, createWordFrequenciesPerSentiment
locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
init_db()
print('Initialize Mastodon...')
mastodonAccountManager = MastodonAccountManager()
mastodonInstance = mastodonAccountManager.instance
"""
@ -19,27 +37,47 @@ mastodonInstance.log_in(
to_file = 'hedonodon_usercred.secret'
)
"""
print('done!')
print('Fetching recent toots...')
tootCrawler = TootCrawler(mastodonInstance)
crudManager = CRUDManager()
lastTootId = crudManager.getLastToot()
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
exit()
print('done!')
print('Save toots to database...')
if not tootsDataframe.empty:
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
print('done!')
else:
print('Nothing changed since last database insert!')
print('nothing changed since last database insert!')
print('Calculate word counts...')
yesterdaysToots = getYesterdaysToots()
translatedToots = translateToots(yesterdaysToots)
wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
print('done!')
print(wordCountsPerSentiment);
print('Calculate sentiment counts...')
sentimentsYesterday = calculateSentimentCount()
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
print('done!')
print('Calculate sentiment mean...')
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
print('done!')
print('Save calculations to database...')
if not tootsDataframe.empty:
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
print('done!')
else:
print('Nothing changed since last database insert!')
print('nothing changed since last database insert!')
print('Create figure...')
colormap = {
'negative': '#ff9999',
'neutral': '#ffcc99',
@ -80,7 +118,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
# Line chart.
lineChart = dataframe4LineChart.plot.line(
ax=axes[1],
title='Mean of all sentiments from max positive (1) to min negative (-1)'
title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!'
)
axes[1].grid(True)
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
@ -92,7 +130,9 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
axes[1].tick_params(which='minor', length=0)
plotFileUrl = f'./plots/{TodayDate}.png'
plt.savefig(plotFileUrl)
print('done!')
print('Send toot...')
#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
print('done!')

View file

@ -1,5 +1,7 @@
from mastodon import Mastodon
class MastodonAccountManager():
def __init__(self):
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
from mastodon import Mastodon
class MastodonAccountManager():
"""Initialize the Mastodon account.
"""
def __init__(self):
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')

View file

@ -1,4 +1,19 @@
# Hedonodon
I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds.
More Documentation coming soon!
# Hedonodon
## Prerequisites
Install the dependencies with `python -m pip install -r requirements.txt`.
Install spaCy's NLP model with `python -m spacy download en_core_web_lg`.
If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fails, go to the model pages on Hugging Face (see the Models section) and download them to the respective folder (cardiffnlp/twitter-roberta-base-sentiment).
## Purpose
Hedonodon fetches toots from fedihum.org, calculates the sentiments, the sentiment mean and the word frequencies of each day, and creates fancy diagrams from the data.
## Motivation
This tool was created to understand how sentiment analysis and NLP methods work, so it may lack proper use of models etc.
## Models
It uses ["germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for German toots, ["twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for
English toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages.
For the word counts I first translate the toots to English with the GoogleTranslator from [deep_translator](https://pypi.org/project/deep-translator/) and then use spaCy's NLP model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies.
## Weaknesses
Since some modules do not return sentiment compounds, I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statistically not okay (;-_-).

View file

@ -1,26 +1,43 @@
from germansentiment import SentimentModel
from pandas import DataFrame
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import spacy
from collections import Counter
# Preprocess text (strip user mentions and links)
def preprocess(text:str) -> str:
    """Removes tags and urls from text.

    The text is split on single spaces. Every token that is a mention
    ('@' followed by at least one character) or starts with 'http' is
    replaced by an empty string, so the surrounding spaces are kept.

    Parameters
    ------
    text: str
        The raw toot from Mastodon.

    Returns
    ------
    str
        The cleaned text.
    """
    new_text = [
        '' if (t.startswith('@') and len(t) > 1) or t.startswith('http') else t
        for t in text.split(" ")
    ]
    return " ".join(new_text)
class SentiTooter:
""""""
"""Class to analyze the toots.
"""
def __init__(self):
"""Initilize the sentiment models and labels.
"""
self.deModel = SentimentModel()
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
self.enModel, self.enTokenizer = self.initModel()
@ -28,7 +45,20 @@ class SentiTooter:
self.labels = ['negative', 'neutral', 'positive']
self.sia = SentimentIntensityAnalyzer()
def analyze(self, language, content):
def analyze(self, language:str, content:str) -> list[str, str, float]:
"""Analyzes the sentiments of the toots.
Parameters
------
language: str
The language tag of the toot.
content: str
The toot content.
Returns
------
list[str, str, float]
A list with the sentiment, analyzer type, and sentiment score.
"""
match language:
case 'de':
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -41,15 +71,15 @@ class SentiTooter:
output = self.enModel(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
print(scores)
#print(scores)
sentimentIndexWithMaxScore = np.argmax(scores)
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
print(sentiment)
#print(sentiment)
return sentiment
case _:
compound = self.sia.polarity_scores(content)['compound']
print(self.sia.polarity_scores(content), 'vaderSentiment')
#print(self.sia.polarity_scores(content), 'vaderSentiment')
if compound > (1 / 3):
return ['positive', 'vaderSentiment']
elif compound < (-1 / 3):
@ -58,8 +88,14 @@ class SentiTooter:
return ['neutral', 'vaderSentiment']
def initModel(self):
"""Initialize the english models.
Returns
------
tupel
The pretrained model and tokenizer.
"""
# PT
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
tokenizer.save_pretrained(self.enModelType)
@ -67,12 +103,93 @@ class SentiTooter:
model.save_pretrained(self.enModelType)
return model, tokenizer
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
    """Translates all toots to english.

    Toots already flagged as 'en' are kept untouched. Toots whose
    translation fails are removed from the result.

    Parameters
    ------
    yesterdaysToots: DataFrame
        Needs the columns 'language' and 'toot'. It is not modified.

    Returns
    ------
    DataFrame
        Containing the english translated toots.
    """
    # Work on a copy so the caller's dataframe is left as it was.
    yesterdaysTootsTranslated = yesterdaysToots.copy()
    failedRows = []
    for index, row in yesterdaysTootsTranslated.iterrows():
        if row['language'] == 'en':
            continue
        try:
            translation = translateToot(row['language'], row['toot'])
        except Exception:
            # Best effort: remember the row and drop it after the loop.
            failedRows.append(index)
            continue
        yesterdaysTootsTranslated.at[index, 'toot'] = translation
        yesterdaysTootsTranslated.at[index, 'language'] = 'en'
    # DataFrame.drop returns a new frame; the result has to be used,
    # otherwise the failed rows stay in (untranslated).
    return yesterdaysTootsTranslated.drop(index=failedRows)
def translateToot(language:str, toot:str) -> str:
    """Translate a toot to english.

    Parameters
    ------
    language: str
        The language of the toot. If it is missing (None or empty),
        the translator is asked to detect the source language itself.
    toot: str
        The toot content.

    Returns
    ------
    str
        The toot translated to english.
    """
    content = preprocess(toot)
    # The crawler stores None when language detection failed; fall back to
    # auto detection instead of passing an invalid source language.
    source = language if isinstance(language, str) and language else 'auto'
    return GoogleTranslator(source=source, target='en').translate(content)
# Loaded spaCy pipelines by model name, so each model is read from disk once.
_nlpModels = {}


def _loadNlpModel(name: str):
    """Return the spaCy pipeline *name*, loading it on first use only."""
    if name not in _nlpModels:
        _nlpModels[name] = spacy.load(name)
    return _nlpModels[name]


def countWords(concatedToots: str, number: int) -> list:
    """Counts the word frequencies in all toots of a given sentiment.

    Only tokens tagged as nouns that are neither stop words nor
    punctuation are counted.

    Parameters
    ------
    concatedToots: str
        All toots from a sentiment.
    number: int
        Number of most frequent words to return.

    Returns
    ------
    list
        List containing tuples of word and word frequency, most
        frequent first.
    """
    nlp = _loadNlpModel('en_core_web_lg')
    doc = nlp(concatedToots)
    # noun tokens that aren't stop words or punctuation
    nouns = [
        token.text
        for token in doc
        if not token.is_stop and not token.is_punct and token.pos_ == "NOUN"
    ]
    # the `number` most common noun tokens
    return Counter(nouns).most_common(number)
def createWordFrequenciesPerSentiment(translatedToots:DataFrame, number:int = 5) -> str:
    """Count all word frequencies of all toots per sentiment.

    Parameters
    ------
    translatedToots: DataFrame
        The dataframe with all toots in english. Needs the columns
        'sentiment' and 'toot'.
    number: int
        How many of the most frequent words to list per sentiment.

    Returns
    ------
    str
        One line per sentiment (positive, neutral, negative) in the form
        'sentiment: word (count), word (count), ...'.
    """
    sentimentList = []
    for sentiment in ['positive', 'neutral', 'negative']:
        tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
        # Missing toots are skipped; joining plain strings also works when
        # the column is empty or has no string dtype.
        concatedToots = ' '.join(tootsSeries.dropna().astype(str))
        wordFrequencies = countWords(concatedToots, number)
        list2String = ', '.join(f'{word} ({count})' for word, count in wordFrequencies)
        sentimentList.append(f'{sentiment}: {list2String}')
    return '\n'.join(sentimentList)

View file

@ -1,32 +1,32 @@
from DbSetup import Base
from sqlalchemy import Column, Date, Integer, Float, String
class Toots(Base):
__tablename__ = 'Toots'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
model = Column(String(30))
datetime = Column(Date)
language = Column(String(3))
sentiment = Column(String(8))
tootId = Column(String(255))
toot = Column(String(600))
userName = Column(String(255))
userId = Column(String(255))
class SentimentCounts(Base):
__tablename__ = 'SentimentCounts'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
sentimentCount = Column(Integer)
date = Column(Date, primary_key=True)
sentiment = Column(String(8))
class SentimentMeans(Base):
__tablename__ = 'SentimentMeans'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
date = Column(Date, primary_key=True)
"""This script containing the table definitions for the database."""
from DbSetup import Base
from sqlalchemy import Column, Date, Integer, Float, String
class Toots(Base):
__tablename__ = 'Toots'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
model = Column(String(30))
datetime = Column(Date)
language = Column(String(3))
sentiment = Column(String(8))
tootId = Column(String(255))
toot = Column(String(600))
userName = Column(String(255))
userId = Column(String(255))
class SentimentCounts(Base):
__tablename__ = 'SentimentCounts'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
sentimentCount = Column(Integer)
date = Column(Date, primary_key=True)
sentiment = Column(String(8))
class SentimentMeans(Base):
__tablename__ = 'SentimentMeans'
__table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True)
date = Column(Date, primary_key=True)
SentimentsMean = Column(Float)

View file

@ -1,48 +1,111 @@
from langdetect import detect
import pytz
import pandas as pd
import re
from SentiTooter import SentiTooter
from pprint import pprint
class TootCrawler():
def __init__(self, mastodonInstance) -> None:
self.mastodonInstance = mastodonInstance
self.compilePattern = re.compile('<.*?>')
self.sentiTooter = SentiTooter()
self.localTimezone = pytz.timezone('Europe/Berlin')
def getLocalTimeline(self, minId=None):
return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
def cleanhtml(self, raw_html):
cleantext = re.sub(self.compilePattern, '', raw_html)
cleantext = re.sub(r'http\S+', '', cleantext)
return cleantext
def buildTootsDataframe(self, minId=None):
toots = []
allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId)
while timelinePagination:
allTimelineResults = allTimelineResults + timelinePagination
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
for i in allTimelineResults:
content = self.cleanhtml(i.content)
language = detect(content)
sentiment = self.sentiTooter.analyze(language, content)
toot = {
"sentiment": sentiment[0],
"model": sentiment[1],
"toot": content,
"datetime": i.created_at.astimezone(self.localTimezone),
"language": language,
"userName": i.account.display_name,
"userId": i.account.id,
"tootId": i.id
}
toots.append(toot)
toots.sort(key=lambda item:item.get('datetime'))
from langdetect import detect
import pytz
import pandas as pd
from pandas import DataFrame
import re
from SentiTooter import SentiTooter
from pprint import pprint
class TootCrawler():
    """Class to fetch the recent toots from fedihum.org."""

    def __init__(self, mastodonInstance) -> None:
        """Initialize the Mastodon instance and depending classes.

        Parameters
        ------
        mastodonInstance
            The initialized Mastodon instance.
        """
        self.mastodonInstance = mastodonInstance
        # Non-greedy match of anything between angle brackets (html tags).
        self.compilePattern = re.compile('<.*?>')
        self.sentiTooter = SentiTooter()
        self.localTimezone = pytz.timezone('Europe/Berlin')

    def getLocalTimeline(self, minId=None) -> list:
        """Receive the local timeline.

        Parameters
        ------
        minId: str | None
            The last fetched toot id from the database.

        Returns
        ------
        list
            The local Mastodon timeline, as returned by
            `timeline_local` of the Mastodon instance.
        """
        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)

    def cleanhtml(self, raw_html:str) -> str:
        """Remove html tags and links from a toot.

        Parameters
        ------
        raw_html: str
            The toot content.

        Returns
        ------
        str
            The cleaned toot content.
        """
        cleantext = re.sub(self.compilePattern, '', raw_html)
        cleantext = re.sub(r'http\S+', '', cleantext)
        return cleantext

    def buildTootsDataframe(self, minId=None) -> DataFrame:
        """Parse fetched toots from Mastodon to dataframe.

        Parameters
        ------
        minId: str | None
            The id of the last fetched toot.

        Returns
        ------
        DataFrame
            A Dataframe, sorted by datetime, containing
            sentiment: str
                The sentiment (positive, neutral, negative)
            model: str
                The used sentiment model.
            toot: str
                The content of the toot.
            datetime: datetime
                The datetime of the toot, converted to Europe/Berlin.
            language: str | None
                The language flag of the toot, None if it could not
                be detected.
            userName: str
                The user name of the toot.
            userId: str
                The user id.
            tootId: str
                The toot id.
        """
        toots = []
        allTimelineResults = []
        timelinePagination = self.getLocalTimeline(minId)
        # Follow the pagination until no further page is returned.
        while timelinePagination:
            allTimelineResults.extend(timelinePagination)
            timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
        for i in allTimelineResults:
            content = self.cleanhtml(i.content)
            try:
                language = detect(content)
            except Exception:
                # Detection can fail (presumably for text without usable
                # features); such toots are analyzed with the fallback model.
                language = None
            sentiment = self.sentiTooter.analyze(language, content)
            toot = {
                "sentiment": sentiment[0],
                "model": sentiment[1],
                "toot": content,
                "datetime": i.created_at.astimezone(self.localTimezone),
                "language": language,
                "userName": i.account.display_name,
                "userId": i.account.id,
                "tootId": i.id
            }
            toots.append(toot)
        toots.sort(key=lambda item:item.get('datetime'))
        return pd.DataFrame.from_records(toots)

View file

@ -3,6 +3,12 @@ matplotlib
pandas
sqlalchemy
vader-multi
langdetect
numpy
pytz
transformers
transformers
wheel
germansentiment
scipy
deep_translator
spacy