Compare commits
11 commits
with_cites
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
03792f2120 | ||
|
|
cafda77e7f | ||
|
|
8d9a7fa603 | ||
|
|
bc842244c7 | ||
|
|
4479bd2429 | ||
|
|
6a8caac29e | ||
|
|
09fd313a89 | ||
|
|
3b677e5713 | ||
|
|
8f7c578087 | ||
|
|
79f54079f7 | ||
|
|
2b98565444 |
10 changed files with 473 additions and 149 deletions
23
.gitignore
vendored
23
.gitignore
vendored
|
|
@ -1,11 +1,12 @@
|
||||||
database.db
|
database.db
|
||||||
plots
|
plots
|
||||||
instance
|
instance
|
||||||
__pycache__
|
__pycache__
|
||||||
hedonodon_clientcred.secret
|
hedonodon_clientcred.secret
|
||||||
hedonodon_usercred.secret
|
hedonodon_usercred.secret
|
||||||
.fleet
|
.fleet
|
||||||
test.py
|
test.py
|
||||||
.idea
|
.idea
|
||||||
cardiffnlp
|
cardiffnlp
|
||||||
venv
|
venv
|
||||||
|
logs.txt
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,21 @@
|
||||||
from DbSetup import engine, session, databaseUrl
|
from DbSetup import connection, engine, session, databaseUrl
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sqlalchemy import desc, select
|
from sqlalchemy import desc, select, sql
|
||||||
from Tables import Toots
|
from Tables import Toots
|
||||||
|
|
||||||
|
from pandas.core.api import (
|
||||||
|
DataFrame)
|
||||||
|
|
||||||
def calculateSentimentCount():
|
def calculateSentimentCount():
|
||||||
|
"""Calculates the frequencies of the sentiments.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
DataFrame
|
||||||
|
Containing date (YY-MM-DD), sentiment (positive, neutral, negative),
|
||||||
|
and sentimentCount.
|
||||||
|
"""
|
||||||
|
|
||||||
query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
|
query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
|
||||||
FROM Toots
|
FROM Toots
|
||||||
GROUP BY DATE(datetime),
|
GROUP BY DATE(datetime),
|
||||||
|
|
@ -12,12 +23,23 @@ def calculateSentimentCount():
|
||||||
HAVING datetime >= DATE("now","-1 day")
|
HAVING datetime >= DATE("now","-1 day")
|
||||||
AND datetime < DATE("now")'''
|
AND datetime < DATE("now")'''
|
||||||
return pd.read_sql(
|
return pd.read_sql(
|
||||||
query,
|
sql.text(query),
|
||||||
databaseUrl,
|
connection,
|
||||||
parse_dates=["datetime"]
|
parse_dates=["datetime"]
|
||||||
)
|
)
|
||||||
|
|
||||||
def calculateSentimentMean(dataframe):
|
def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
|
||||||
|
"""Calculates the mean of the sentiments.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
-------
|
||||||
|
dataframe: DataFrame
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
Dataframe
|
||||||
|
Containing date (YY-MM-DD), sentimentsMean.
|
||||||
|
"""
|
||||||
negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
|
negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
|
||||||
positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
|
positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
|
||||||
sentimentSum = dataframe['sentimentCount'].sum()
|
sentimentSum = dataframe['sentimentCount'].sum()
|
||||||
|
|
@ -32,17 +54,68 @@ def calculateSentimentMean(dataframe):
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
class CRUDManager():
|
def getYesterdaysToots() -> DataFrame:
|
||||||
|
"""Fetches yesterdays toots from database.
|
||||||
|
|
||||||
def saveToDatabase(self, dataframe, table:str, useIndex=False):
|
Returns
|
||||||
|
-------
|
||||||
|
pd.Dataframe
|
||||||
|
Containing date (YY-MM-DD), language, sentiment, toot.
|
||||||
|
"""
|
||||||
|
query = f'''SELECT datetime as date, language, sentiment, toot
|
||||||
|
FROM Toots
|
||||||
|
WHERE datetime >= DATE("now","-1 day")
|
||||||
|
AND datetime < DATE("now")'''
|
||||||
|
return pd.read_sql(
|
||||||
|
sql.text(query),
|
||||||
|
connection,
|
||||||
|
parse_dates=["datetime"]
|
||||||
|
)
|
||||||
|
|
||||||
|
class CRUDManager():
|
||||||
|
"""Class for database operations"""
|
||||||
|
|
||||||
|
def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
|
||||||
|
"""Saves dataframe to database.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
-------
|
||||||
|
dataframe: DataFrame
|
||||||
|
Input dataframe.
|
||||||
|
table: str
|
||||||
|
Table, where to save the data.
|
||||||
|
useIndex: boolean
|
||||||
|
Should the index of the dataframe be used as index for
|
||||||
|
the database table?
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
|
dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
|
||||||
except:
|
except:
|
||||||
print(f'Could not save data to {table}!')
|
print(f'Could not save data to {table}!')
|
||||||
|
|
||||||
def loadFromDatabase(self, table:str, indexColumn=None):
|
def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
|
||||||
return pd.read_sql_table(table, databaseUrl, index_col=indexColumn)
|
"""Load a table into a dataframe.
|
||||||
|
|
||||||
def getLastToot(self):
|
Parameters
|
||||||
|
-------
|
||||||
|
table: str
|
||||||
|
Table, where to save the data.
|
||||||
|
indexColumn: str | None
|
||||||
|
Should the index of the table be used as index for
|
||||||
|
the dataframe?
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
DataFrame
|
||||||
|
"""
|
||||||
|
return pd.read_sql_table(table, connection, index_col=indexColumn)
|
||||||
|
|
||||||
|
def getLastToot(self) -> str:
|
||||||
|
"""Query the last toot id from database.
|
||||||
|
|
||||||
|
Results
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
A toot id.
|
||||||
|
"""
|
||||||
stmt = select(Toots.tootId).order_by(desc('datetime'))
|
stmt = select(Toots.tootId).order_by(desc('datetime'))
|
||||||
return session.scalars(stmt).first()
|
return session.scalars(stmt).first()
|
||||||
|
|
|
||||||
29
DbSetup.py
29
DbSetup.py
|
|
@ -1,11 +1,18 @@
|
||||||
from sqlalchemy import create_engine
|
"""Script to initialize the database.
|
||||||
from sqlalchemy.orm import Session
|
Serves database url, engine, connection and session.
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
"""
|
||||||
|
|
||||||
databaseUrl = 'sqlite:///database.db'
|
from sqlalchemy import create_engine
|
||||||
engine = create_engine(databaseUrl, future=True)
|
from sqlalchemy.orm import Session
|
||||||
session = Session(engine)
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
Base = declarative_base()
|
|
||||||
|
databaseUrl = 'sqlite:///database.db'
|
||||||
def init_db():
|
engine = create_engine(databaseUrl, future=True)
|
||||||
Base.metadata.create_all(bind=engine)
|
connection = engine.connect()
|
||||||
|
session = Session(engine)
|
||||||
|
Base = declarative_base()
|
||||||
|
|
||||||
|
def init_db():
|
||||||
|
"""Initialize the database.
|
||||||
|
"""
|
||||||
|
Base.metadata.create_all(bind=engine)
|
||||||
|
|
|
||||||
60
Main.py
60
Main.py
|
|
@ -1,4 +1,20 @@
|
||||||
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
|
"""
|
||||||
|
Hedonodon toot sentiment analyzer.
|
||||||
|
|
||||||
|
This programm fetches toots from the fedihum.org Mastodon instance, calculates
|
||||||
|
the frequencies of the sentiments (positive, neutral, negative) and the mean
|
||||||
|
from these nominal values (even this is not statistical correct (;-_-)!, but
|
||||||
|
not all analyzer return compounds).
|
||||||
|
It also calculates the word count of the nouns per sentiment.
|
||||||
|
|
||||||
|
It uses germansentiment for german toots, twitter-roberta-base-sentiment for
|
||||||
|
english toots, and vaderSentiment for other languages.
|
||||||
|
|
||||||
|
For the word counts I translate the toots to english with the GoogleTranslator
|
||||||
|
first.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from DbSetup import init_db
|
from DbSetup import init_db
|
||||||
import locale
|
import locale
|
||||||
|
|
@ -6,10 +22,12 @@ from MastodonAccountManager import MastodonAccountManager
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import matplotlib.dates as mdates
|
import matplotlib.dates as mdates
|
||||||
from TootCrawler import TootCrawler
|
from TootCrawler import TootCrawler
|
||||||
|
from SentiTooter import translateToots, createWordFrequenciesPerSentiment
|
||||||
|
|
||||||
locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
|
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
|
||||||
init_db()
|
init_db()
|
||||||
|
|
||||||
|
print('Initialize Mastodon...')
|
||||||
mastodonAccountManager = MastodonAccountManager()
|
mastodonAccountManager = MastodonAccountManager()
|
||||||
mastodonInstance = mastodonAccountManager.instance
|
mastodonInstance = mastodonAccountManager.instance
|
||||||
"""
|
"""
|
||||||
|
|
@ -19,27 +37,47 @@ mastodonInstance.log_in(
|
||||||
to_file = 'hedonodon_usercred.secret'
|
to_file = 'hedonodon_usercred.secret'
|
||||||
)
|
)
|
||||||
"""
|
"""
|
||||||
|
print('done!')
|
||||||
|
|
||||||
|
print('Fetching recent toots...')
|
||||||
tootCrawler = TootCrawler(mastodonInstance)
|
tootCrawler = TootCrawler(mastodonInstance)
|
||||||
crudManager = CRUDManager()
|
crudManager = CRUDManager()
|
||||||
|
|
||||||
lastTootId = crudManager.getLastToot()
|
lastTootId = crudManager.getLastToot()
|
||||||
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
|
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
|
||||||
exit()
|
print('done!')
|
||||||
|
|
||||||
|
print('Save toots to database...')
|
||||||
if not tootsDataframe.empty:
|
if not tootsDataframe.empty:
|
||||||
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
|
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
|
||||||
|
print('done!')
|
||||||
else:
|
else:
|
||||||
print('Nothing changed since last database insert!')
|
print('nothing changed since last database insert!')
|
||||||
|
|
||||||
|
print('Calculate word counts...')
|
||||||
|
yesterdaysToots = getYesterdaysToots()
|
||||||
|
translatedToots = translateToots(yesterdaysToots)
|
||||||
|
wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
|
||||||
|
print('done!')
|
||||||
|
|
||||||
|
print(wordCountsPerSentiment);
|
||||||
|
|
||||||
|
print('Calculate sentiment counts...')
|
||||||
sentimentsYesterday = calculateSentimentCount()
|
sentimentsYesterday = calculateSentimentCount()
|
||||||
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
|
print('done!')
|
||||||
|
|
||||||
|
print('Calculate sentiment mean...')
|
||||||
|
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
|
||||||
|
print('done!')
|
||||||
|
|
||||||
|
print('Save calculations to database...')
|
||||||
if not tootsDataframe.empty:
|
if not tootsDataframe.empty:
|
||||||
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
|
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
|
||||||
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
|
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
|
||||||
|
print('done!')
|
||||||
else:
|
else:
|
||||||
print('Nothing changed since last database insert!')
|
print('nothing changed since last database insert!')
|
||||||
|
|
||||||
|
print('Create figure...')
|
||||||
colormap = {
|
colormap = {
|
||||||
'negative': '#ff9999',
|
'negative': '#ff9999',
|
||||||
'neutral': '#ffcc99',
|
'neutral': '#ffcc99',
|
||||||
|
|
@ -80,7 +118,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
|
||||||
# Line chart.
|
# Line chart.
|
||||||
lineChart = dataframe4LineChart.plot.line(
|
lineChart = dataframe4LineChart.plot.line(
|
||||||
ax=axes[1],
|
ax=axes[1],
|
||||||
title='Mean of all sentiments from max positive (1) to min negative (-1)'
|
title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!'
|
||||||
)
|
)
|
||||||
axes[1].grid(True)
|
axes[1].grid(True)
|
||||||
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
|
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
|
||||||
|
|
@ -92,7 +130,9 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
|
||||||
axes[1].tick_params(which='minor', length=0)
|
axes[1].tick_params(which='minor', length=0)
|
||||||
plotFileUrl = f'./plots/{TodayDate}.png'
|
plotFileUrl = f'./plots/{TodayDate}.png'
|
||||||
plt.savefig(plotFileUrl)
|
plt.savefig(plotFileUrl)
|
||||||
|
print('done!')
|
||||||
|
|
||||||
|
print('Send toot...')
|
||||||
#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
|
#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
|
||||||
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
|
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
|
||||||
|
print('done!')
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,7 @@
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
|
|
||||||
class MastodonAccountManager():
|
class MastodonAccountManager():
|
||||||
def __init__(self):
|
"""Initialize the Mastodon account.
|
||||||
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
|
"""
|
||||||
|
def __init__(self):
|
||||||
|
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
|
||||||
|
|
|
||||||
23
README.md
23
README.md
|
|
@ -1,4 +1,19 @@
|
||||||
# Hedonodon
|
# Hedonodon
|
||||||
I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds.
|
## Prerequisites
|
||||||
|
Install the dependencies with `python -m pip install -r requirements.txt`.
|
||||||
More Documentation coming soon!
|
Install spaCy's NLP model with `python -m spacy download en_core_web_lg`.
|
||||||
|
If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fails, go to the model page on Hugging Face (see the Models section) and download them to the respective folder (cardiffnlp/twitter-roberta-base-sentiment).
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
Hedonodon fetches toots from fedihum.org, calculates the sentiments, the sentiment mean, and the word frequencies for each day, and creates fancy diagrams from the data.
|
||||||
|
|
||||||
|
## Motivation
|
||||||
|
This tool was created to understand how sentiment analysis and NLP methods work, so it may lack proper use of models, etc.
|
||||||
|
|
||||||
|
## Models
|
||||||
|
It uses ["germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for German toots, ["twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for
|
||||||
|
English toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages.
|
||||||
|
For the word counts, I first translate the toots to English with the GoogleTranslator from [deep_translator](https://pypi.org/project/deep-translator/) and then use spaCy's NLP model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies.
|
||||||
|
|
||||||
|
## Weaknesses
|
||||||
|
Since some modules do not return sentiment compounds, I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statistically not okay (;-_-).
|
||||||
153
SentiTooter.py
153
SentiTooter.py
|
|
@ -1,26 +1,43 @@
|
||||||
from germansentiment import SentimentModel
|
from germansentiment import SentimentModel
|
||||||
|
from pandas import DataFrame
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy.special import softmax
|
from scipy.special import softmax
|
||||||
from transformers import AutoModelForSequenceClassification
|
from transformers import AutoModelForSequenceClassification
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
|
from deep_translator import GoogleTranslator
|
||||||
|
import spacy
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
# Preprocess text (username and link placeholders)
|
# Preprocess text (username and link placeholders)
|
||||||
def preprocess(text):
|
def preprocess(text:str) -> str:
|
||||||
|
"""Removes tags and urls from text.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
------
|
||||||
|
text: str
|
||||||
|
The raw toot from Mastodon.
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
str
|
||||||
|
The cleaned text.
|
||||||
|
"""
|
||||||
new_text = []
|
new_text = []
|
||||||
|
|
||||||
for t in text.split(" "):
|
for t in text.split(" "):
|
||||||
t = '@user' if t.startswith('@') and len(t) > 1 else t
|
t = '' if t.startswith('@') and len(t) > 1 else t
|
||||||
t = 'http' if t.startswith('http') else t
|
t = '' if t.startswith('http') else t
|
||||||
new_text.append(t)
|
new_text.append(t)
|
||||||
return " ".join(new_text)
|
return " ".join(new_text)
|
||||||
|
|
||||||
|
|
||||||
class SentiTooter:
|
class SentiTooter:
|
||||||
""""""
|
"""Class to analyze the toots.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
"""Initilize the sentiment models and labels.
|
||||||
|
"""
|
||||||
self.deModel = SentimentModel()
|
self.deModel = SentimentModel()
|
||||||
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
|
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
|
||||||
self.enModel, self.enTokenizer = self.initModel()
|
self.enModel, self.enTokenizer = self.initModel()
|
||||||
|
|
@ -28,7 +45,20 @@ class SentiTooter:
|
||||||
self.labels = ['negative', 'neutral', 'positive']
|
self.labels = ['negative', 'neutral', 'positive']
|
||||||
self.sia = SentimentIntensityAnalyzer()
|
self.sia = SentimentIntensityAnalyzer()
|
||||||
|
|
||||||
def analyze(self, language, content):
|
def analyze(self, language:str, content:str) -> list[str, str, float]:
|
||||||
|
"""Analyzes the sentiments of the toots.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
------
|
||||||
|
language: str
|
||||||
|
The language tag of the toot.
|
||||||
|
content: str
|
||||||
|
The toot content.
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
list[str, str, float]
|
||||||
|
A list with the sentiment, analyzer type, and sentiment score.
|
||||||
|
"""
|
||||||
match language:
|
match language:
|
||||||
case 'de':
|
case 'de':
|
||||||
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
|
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
|
||||||
|
|
@ -41,15 +71,15 @@ class SentiTooter:
|
||||||
output = self.enModel(**encoded_input)
|
output = self.enModel(**encoded_input)
|
||||||
scores = output[0][0].detach().numpy()
|
scores = output[0][0].detach().numpy()
|
||||||
scores = softmax(scores)
|
scores = softmax(scores)
|
||||||
print(scores)
|
#print(scores)
|
||||||
sentimentIndexWithMaxScore = np.argmax(scores)
|
sentimentIndexWithMaxScore = np.argmax(scores)
|
||||||
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
|
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
|
||||||
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
|
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
|
||||||
print(sentiment)
|
#print(sentiment)
|
||||||
return sentiment
|
return sentiment
|
||||||
case _:
|
case _:
|
||||||
compound = self.sia.polarity_scores(content)['compound']
|
compound = self.sia.polarity_scores(content)['compound']
|
||||||
print(self.sia.polarity_scores(content), 'vaderSentiment')
|
#print(self.sia.polarity_scores(content), 'vaderSentiment')
|
||||||
if compound > (1 / 3):
|
if compound > (1 / 3):
|
||||||
return ['positive', 'vaderSentiment']
|
return ['positive', 'vaderSentiment']
|
||||||
elif compound < (-1 / 3):
|
elif compound < (-1 / 3):
|
||||||
|
|
@ -58,8 +88,14 @@ class SentiTooter:
|
||||||
return ['neutral', 'vaderSentiment']
|
return ['neutral', 'vaderSentiment']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def initModel(self):
|
def initModel(self):
|
||||||
|
"""Initialize the english models.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
tupel
|
||||||
|
The pretrained model and tokenizer.
|
||||||
|
"""
|
||||||
# PT
|
# PT
|
||||||
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
|
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
|
||||||
tokenizer.save_pretrained(self.enModelType)
|
tokenizer.save_pretrained(self.enModelType)
|
||||||
|
|
@ -67,12 +103,93 @@ class SentiTooter:
|
||||||
model.save_pretrained(self.enModelType)
|
model.save_pretrained(self.enModelType)
|
||||||
return model, tokenizer
|
return model, tokenizer
|
||||||
|
|
||||||
# # TF
|
def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
|
||||||
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
|
"""Translates all toots to english.
|
||||||
# model.save_pretrained(MODEL)
|
|
||||||
|
|
||||||
# text = "Good night 😊"
|
Returns
|
||||||
# encoded_input = tokenizer(text, return_tensors='tf')
|
------
|
||||||
# output = model(encoded_input)
|
Dataframe
|
||||||
# scores = output[0][0].numpy()
|
Containing the english translated toots.
|
||||||
# scores = softmax(scores)
|
"""
|
||||||
|
yesterdaysTootsTranslated = yesterdaysToots
|
||||||
|
for index, row in yesterdaysTootsTranslated.iterrows():
|
||||||
|
if (row['language'] != 'en'):
|
||||||
|
try:
|
||||||
|
yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot'])
|
||||||
|
yesterdaysTootsTranslated.at[index,'language'] = 'en'
|
||||||
|
except:
|
||||||
|
yesterdaysTootsTranslated.drop(index)
|
||||||
|
return yesterdaysTootsTranslated
|
||||||
|
|
||||||
|
def translateToot(language:str, toot:str) -> str:
|
||||||
|
"""Translate a toot in english.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
------
|
||||||
|
language:str
|
||||||
|
The language of the toot.
|
||||||
|
toot: str
|
||||||
|
The toot content.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
str
|
||||||
|
The in english translated toot.
|
||||||
|
"""
|
||||||
|
content = preprocess(toot)
|
||||||
|
return GoogleTranslator(source=language, target='en').translate(content)
|
||||||
|
|
||||||
|
def countWords(concatedToots: str, number: int) -> list:
|
||||||
|
"""Counts the word frequencies in all toots of a given sentiment.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
------
|
||||||
|
concatedToots: str
|
||||||
|
All toots from a sentiment.
|
||||||
|
number: int
|
||||||
|
Number of words to calculate word frequencies.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
list
|
||||||
|
List containing tuple of word and word frequency.
|
||||||
|
"""
|
||||||
|
nlp = spacy.load('en_core_web_lg')
|
||||||
|
doc = nlp(concatedToots)
|
||||||
|
|
||||||
|
# noun tokens that arent stop words or punctuations
|
||||||
|
nouns = [token.text
|
||||||
|
for token in doc
|
||||||
|
if (not token.is_stop and
|
||||||
|
not token.is_punct and
|
||||||
|
token.pos_ == "NOUN")]
|
||||||
|
|
||||||
|
# five most common noun tokens
|
||||||
|
noun_freq = Counter(nouns)
|
||||||
|
return noun_freq.most_common(number)
|
||||||
|
|
||||||
|
def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
|
||||||
|
"""Count all word frequencies of all toots per sentiment.
|
||||||
|
|
||||||
|
Paramters
|
||||||
|
------
|
||||||
|
translatedToots: DataFrame
|
||||||
|
The dataframe with all toots in english.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
str
|
||||||
|
Containing words and wourd counts per sentiment.
|
||||||
|
"""
|
||||||
|
sentimentList = []
|
||||||
|
for sentiment in ['positive', 'neutral', 'negative']:
|
||||||
|
tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
|
||||||
|
wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
|
||||||
|
FrequenciesList = []
|
||||||
|
for Frequencies in wordFrequencies:
|
||||||
|
FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
|
||||||
|
list2String = ', '.join(FrequenciesList)
|
||||||
|
sentimentString = sentiment + ': ' + list2String
|
||||||
|
sentimentList.append(sentimentString)
|
||||||
|
wordFrequenciessPerSentiments = '\n'.join(sentimentList)
|
||||||
|
return wordFrequenciessPerSentiments
|
||||||
62
Tables.py
62
Tables.py
|
|
@ -1,32 +1,32 @@
|
||||||
from DbSetup import Base
|
"""This script containing the table definitions for the database."""
|
||||||
from sqlalchemy import Column, Date, Integer, Float, String
|
|
||||||
|
from DbSetup import Base
|
||||||
class Toots(Base):
|
from sqlalchemy import Column, Date, Integer, Float, String
|
||||||
__tablename__ = 'Toots'
|
|
||||||
__table_args__ = {'extend_existing': True}
|
class Toots(Base):
|
||||||
index = Column(Integer, primary_key=True)
|
__tablename__ = 'Toots'
|
||||||
model = Column(String(30))
|
__table_args__ = {'extend_existing': True}
|
||||||
datetime = Column(Date)
|
index = Column(Integer, primary_key=True)
|
||||||
language = Column(String(3))
|
model = Column(String(30))
|
||||||
sentiment = Column(String(8))
|
datetime = Column(Date)
|
||||||
tootId = Column(String(255))
|
language = Column(String(3))
|
||||||
toot = Column(String(600))
|
sentiment = Column(String(8))
|
||||||
userName = Column(String(255))
|
tootId = Column(String(255))
|
||||||
userId = Column(String(255))
|
toot = Column(String(600))
|
||||||
|
userName = Column(String(255))
|
||||||
|
userId = Column(String(255))
|
||||||
|
|
||||||
class SentimentCounts(Base):
|
class SentimentCounts(Base):
|
||||||
__tablename__ = 'SentimentCounts'
|
__tablename__ = 'SentimentCounts'
|
||||||
__table_args__ = {'extend_existing': True}
|
__table_args__ = {'extend_existing': True}
|
||||||
index = Column(Integer, primary_key=True)
|
index = Column(Integer, primary_key=True)
|
||||||
sentimentCount = Column(Integer)
|
sentimentCount = Column(Integer)
|
||||||
date = Column(Date, primary_key=True)
|
date = Column(Date, primary_key=True)
|
||||||
sentiment = Column(String(8))
|
sentiment = Column(String(8))
|
||||||
|
|
||||||
class SentimentMeans(Base):
|
class SentimentMeans(Base):
|
||||||
__tablename__ = 'SentimentMeans'
|
__tablename__ = 'SentimentMeans'
|
||||||
__table_args__ = {'extend_existing': True}
|
__table_args__ = {'extend_existing': True}
|
||||||
index = Column(Integer, primary_key=True)
|
index = Column(Integer, primary_key=True)
|
||||||
date = Column(Date, primary_key=True)
|
date = Column(Date, primary_key=True)
|
||||||
SentimentsMean = Column(Float)
|
SentimentsMean = Column(Float)
|
||||||
157
TootCrawler.py
157
TootCrawler.py
|
|
@ -1,48 +1,111 @@
|
||||||
from langdetect import detect
|
from langdetect import detect
|
||||||
import pytz
|
import pytz
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import re
|
from pandas import DataFrame
|
||||||
from SentiTooter import SentiTooter
|
import re
|
||||||
from pprint import pprint
|
from SentiTooter import SentiTooter
|
||||||
|
from pprint import pprint
|
||||||
class TootCrawler():
|
|
||||||
|
class TootCrawler():
|
||||||
def __init__(self, mastodonInstance) -> None:
|
"""Class to fetch the recent toots from fedihum.org."""
|
||||||
self.mastodonInstance = mastodonInstance
|
|
||||||
self.compilePattern = re.compile('<.*?>')
|
def __init__(self, mastodonInstance: any) -> None:
|
||||||
self.sentiTooter = SentiTooter()
|
"""Initialize the Mastodon instance and depending classes.
|
||||||
self.localTimezone = pytz.timezone('Europe/Berlin')
|
|
||||||
|
Parameters
|
||||||
def getLocalTimeline(self, minId=None):
|
------
|
||||||
return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
|
mastodonInstance: any
|
||||||
|
The initialized Mastodon instance.
|
||||||
def cleanhtml(self, raw_html):
|
"""
|
||||||
cleantext = re.sub(self.compilePattern, '', raw_html)
|
self.mastodonInstance = mastodonInstance
|
||||||
cleantext = re.sub(r'http\S+', '', cleantext)
|
self.compilePattern = re.compile('<.*?>')
|
||||||
return cleantext
|
self.sentiTooter = SentiTooter()
|
||||||
|
self.localTimezone = pytz.timezone('Europe/Berlin')
|
||||||
def buildTootsDataframe(self, minId=None):
|
|
||||||
toots = []
|
def getLocalTimeline(self, minId=None) -> any:
|
||||||
allTimelineResults = []
|
"""Receave the local timeline
|
||||||
timelinePagination = self.getLocalTimeline(minId)
|
|
||||||
|
Parameters
|
||||||
while timelinePagination:
|
------
|
||||||
allTimelineResults = allTimelineResults + timelinePagination
|
minId: str | None
|
||||||
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
|
The last fetched toot id from the database.
|
||||||
for i in allTimelineResults:
|
|
||||||
content = self.cleanhtml(i.content)
|
Returns
|
||||||
language = detect(content)
|
------
|
||||||
sentiment = self.sentiTooter.analyze(language, content)
|
any
|
||||||
toot = {
|
The local Mastodon timeline from fedihum.org.
|
||||||
"sentiment": sentiment[0],
|
"""
|
||||||
"model": sentiment[1],
|
return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
|
||||||
"toot": content,
|
|
||||||
"datetime": i.created_at.astimezone(self.localTimezone),
|
def cleanhtml(self, raw_html:str) -> str:
|
||||||
"language": language,
|
"""remove brackets and http string from toots
|
||||||
"userName": i.account.display_name,
|
|
||||||
"userId": i.account.id,
|
Parameters
|
||||||
"tootId": i.id
|
------
|
||||||
}
|
raw_html: str
|
||||||
toots.append(toot)
|
The toot content.
|
||||||
toots.sort(key=lambda item:item.get('datetime'))
|
Returns
|
||||||
|
------
|
||||||
|
str:
|
||||||
|
The cleaned toot content.
|
||||||
|
"""
|
||||||
|
cleantext = re.sub(self.compilePattern, '', raw_html)
|
||||||
|
cleantext = re.sub(r'http\S+', '', cleantext)
|
||||||
|
return cleantext
|
||||||
|
|
||||||
|
def buildTootsDataframe(self, minId=None) -> DataFrame:
|
||||||
|
"""Parse fetched toots from Mastodon to dataframe.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
------
|
||||||
|
minId: str | None
|
||||||
|
The id of the last fetched toot.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
------
|
||||||
|
DataFrame
|
||||||
|
A Dataframe containing
|
||||||
|
sentiment: str
|
||||||
|
The sentiment (positive, neutral, negative)
|
||||||
|
model: str
|
||||||
|
The used sentiment model.
|
||||||
|
toot: str
|
||||||
|
The content of the toot.
|
||||||
|
datetime: datetime
|
||||||
|
The datetime of the toot.
|
||||||
|
language: str
|
||||||
|
The langage flag of the toot.
|
||||||
|
userName: str.
|
||||||
|
The user name of the toot.
|
||||||
|
userId: str
|
||||||
|
The user id.
|
||||||
|
tootId: str
|
||||||
|
The toot id.
|
||||||
|
"""
|
||||||
|
toots = []
|
||||||
|
allTimelineResults = []
|
||||||
|
timelinePagination = self.getLocalTimeline(minId)
|
||||||
|
|
||||||
|
while timelinePagination:
|
||||||
|
allTimelineResults = allTimelineResults + timelinePagination
|
||||||
|
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
|
||||||
|
for i in allTimelineResults:
|
||||||
|
content = self.cleanhtml(i.content)
|
||||||
|
try:
|
||||||
|
language = detect(content)
|
||||||
|
except:
|
||||||
|
language = None
|
||||||
|
sentiment = self.sentiTooter.analyze(language, content)
|
||||||
|
toot = {
|
||||||
|
"sentiment": sentiment[0],
|
||||||
|
"model": sentiment[1],
|
||||||
|
"toot": content,
|
||||||
|
"datetime": i.created_at.astimezone(self.localTimezone),
|
||||||
|
"language": language,
|
||||||
|
"userName": i.account.display_name,
|
||||||
|
"userId": i.account.id,
|
||||||
|
"tootId": i.id
|
||||||
|
}
|
||||||
|
toots.append(toot)
|
||||||
|
toots.sort(key=lambda item:item.get('datetime'))
|
||||||
return pd.DataFrame.from_records(toots)
|
return pd.DataFrame.from_records(toots)
|
||||||
|
|
@ -3,6 +3,12 @@ matplotlib
|
||||||
pandas
|
pandas
|
||||||
sqlalchemy
|
sqlalchemy
|
||||||
vader-multi
|
vader-multi
|
||||||
|
langdetect
|
||||||
numpy
|
numpy
|
||||||
pytz
|
pytz
|
||||||
transformers
|
transformers
|
||||||
|
wheel
|
||||||
|
germansentiment
|
||||||
|
scipy
|
||||||
|
deep_translator
|
||||||
|
spacy
|
||||||
Loading…
Add table
Add a link
Reference in a new issue