Compare commits

..

No commits in common. "main" and "with_cites" have entirely different histories.

10 changed files with 147 additions and 471 deletions

23
.gitignore vendored
View file

@ -1,12 +1,11 @@
database.db database.db
plots plots
instance instance
__pycache__ __pycache__
hedonodon_clientcred.secret hedonodon_clientcred.secret
hedonodon_usercred.secret hedonodon_usercred.secret
.fleet .fleet
test.py test.py
.idea .idea
cardiffnlp cardiffnlp
venv venv
logs.txt

View file

@ -1,21 +1,10 @@
from DbSetup import connection, engine, session, databaseUrl from DbSetup import engine, session, databaseUrl
import pandas as pd import pandas as pd
from sqlalchemy import desc, select, sql from sqlalchemy import desc, select
from Tables import Toots from Tables import Toots
from pandas.core.api import (
DataFrame)
def calculateSentimentCount(): def calculateSentimentCount():
"""Calculates the frequencies of the sentiments.
Returns
-------
DataFrame
Containing date (YY-MM-DD), sentiment (positive, neutral, negative),
and sentimentCount.
"""
query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
FROM Toots FROM Toots
GROUP BY DATE(datetime), GROUP BY DATE(datetime),
@ -23,23 +12,12 @@ def calculateSentimentCount():
HAVING datetime >= DATE("now","-1 day") HAVING datetime >= DATE("now","-1 day")
AND datetime < DATE("now")''' AND datetime < DATE("now")'''
return pd.read_sql( return pd.read_sql(
sql.text(query), query,
connection, databaseUrl,
parse_dates=["datetime"] parse_dates=["datetime"]
) )
def calculateSentimentMean(dataframe:DataFrame) -> DataFrame: def calculateSentimentMean(dataframe):
"""Calculates the mean of the sentiments.
Parameters
-------
dataframe: DataFrame
Returns
-------
Dataframe
Containing date (YY-MM-DD), sentimentsMean.
"""
negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
sentimentSum = dataframe['sentimentCount'].sum() sentimentSum = dataframe['sentimentCount'].sum()
@ -54,68 +32,17 @@ def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
] ]
) )
def getYesterdaysToots() -> DataFrame:
    """Fetch yesterday's toots from the database.

    Selects every row of the ``Toots`` table whose ``datetime`` is on or
    after ``DATE('now', '-1 day')`` and before ``DATE('now')``.

    Returns
    -------
    DataFrame
        Columns ``date``, ``language``, ``sentiment`` and ``toot``.
    """
    # SQL string literals belong in single quotes; SQLite accepts
    # double-quoted ones only as a legacy fallback.
    query = """SELECT datetime AS date, language, sentiment, toot
               FROM Toots
               WHERE datetime >= DATE('now', '-1 day')
               AND datetime < DATE('now')"""
    return pd.read_sql(
        sql.text(query),
        connection,
        # The query aliases ``datetime`` to ``date``, so ``date`` is the
        # column name pandas sees in the result set and has to parse.
        parse_dates=["date"],
    )
class CRUDManager(): class CRUDManager():
"""Class for database operations"""
def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False): def saveToDatabase(self, dataframe, table:str, useIndex=False):
"""Saves dataframe to database.
Parameters
-------
dataframe: DataFrame
Input dataframe.
table: str
Table, where to save the data.
useIndex: boolean
Should the index of the dataframe be used as index for
the database table?
"""
try: try:
dataframe.to_sql(table, engine, index=useIndex, if_exists="append") dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
except: except:
print(f'Could not save data to {table}!') print(f'Could not save data to {table}!')
def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame: def loadFromDatabase(self, table:str, indexColumn=None):
"""Load a table into a dataframe. return pd.read_sql_table(table, databaseUrl, index_col=indexColumn)
Parameters def getLastToot(self):
-------
table: str
Table, where to save the data.
indexColumn: str | None
Should the index of the table be used as index for
the dataframe?
Returns
-------
DataFrame
"""
return pd.read_sql_table(table, connection, index_col=indexColumn)
def getLastToot(self) -> str:
"""Query the last toot id from database.
Results
-------
str
A toot id.
"""
stmt = select(Toots.tootId).order_by(desc('datetime')) stmt = select(Toots.tootId).order_by(desc('datetime'))
return session.scalars(stmt).first() return session.scalars(stmt).first()

View file

@ -1,18 +1,11 @@
"""Script to initialize the database. from sqlalchemy import create_engine
Serves database url, engine, connection and session. from sqlalchemy.orm import Session
""" from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine databaseUrl = 'sqlite:///database.db'
from sqlalchemy.orm import Session engine = create_engine(databaseUrl, future=True)
from sqlalchemy.ext.declarative import declarative_base session = Session(engine)
Base = declarative_base()
databaseUrl = 'sqlite:///database.db'
engine = create_engine(databaseUrl, future=True) def init_db():
connection = engine.connect() Base.metadata.create_all(bind=engine)
session = Session(engine)
Base = declarative_base()
def init_db():
"""Initialize the database.
"""
Base.metadata.create_all(bind=engine)

58
Main.py
View file

@ -1,20 +1,4 @@
""" from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
Hedonodon toot sentiment analyzer.
This program fetches toots from the fedihum.org Mastodon instance, calculates
the frequencies of the sentiments (positive, neutral, negative) and the mean
of these nominal values (even though this is not statistically correct (;-_-)!,
but not all analyzers return compound scores).
It also calculates the word count of the nouns per sentiment.
It uses germansentiment for german toots, twitter-roberta-base-sentiment for
english toots, and vaderSentiment for other languages.
For the word counts I translate the toots to english with the GoogleTranslator
first.
"""
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
from datetime import datetime, date from datetime import datetime, date
from DbSetup import init_db from DbSetup import init_db
import locale import locale
@ -22,12 +6,10 @@ from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.dates as mdates import matplotlib.dates as mdates
from TootCrawler import TootCrawler from TootCrawler import TootCrawler
from SentiTooter import translateToots, createWordFrequenciesPerSentiment
locale.setlocale(locale.LC_TIME, "en_US.UTF-8") locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
init_db() init_db()
print('Initialize Mastodon...')
mastodonAccountManager = MastodonAccountManager() mastodonAccountManager = MastodonAccountManager()
mastodonInstance = mastodonAccountManager.instance mastodonInstance = mastodonAccountManager.instance
""" """
@ -37,47 +19,27 @@ mastodonInstance.log_in(
to_file = 'hedonodon_usercred.secret' to_file = 'hedonodon_usercred.secret'
) )
""" """
print('done!')
print('Fetching recent toots...')
tootCrawler = TootCrawler(mastodonInstance) tootCrawler = TootCrawler(mastodonInstance)
crudManager = CRUDManager() crudManager = CRUDManager()
lastTootId = crudManager.getLastToot() lastTootId = crudManager.getLastToot()
tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
print('done!') exit()
print('Save toots to database...')
if not tootsDataframe.empty: if not tootsDataframe.empty:
crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
print('done!')
else: else:
print('nothing changed since last database insert!') print('Nothing changed since last database insert!')
print('Calculate word counts...')
yesterdaysToots = getYesterdaysToots()
translatedToots = translateToots(yesterdaysToots)
wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
print('done!')
print(wordCountsPerSentiment);
print('Calculate sentiment counts...')
sentimentsYesterday = calculateSentimentCount() sentimentsYesterday = calculateSentimentCount()
print('done!')
print('Calculate sentiment mean...')
sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
print('done!')
print('Save calculations to database...')
if not tootsDataframe.empty: if not tootsDataframe.empty:
crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True) crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
print('done!')
else: else:
print('nothing changed since last database insert!') print('Nothing changed since last database insert!')
print('Create figure...')
colormap = { colormap = {
'negative': '#ff9999', 'negative': '#ff9999',
'neutral': '#ffcc99', 'neutral': '#ffcc99',
@ -118,7 +80,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
# Line chart. # Line chart.
lineChart = dataframe4LineChart.plot.line( lineChart = dataframe4LineChart.plot.line(
ax=axes[1], ax=axes[1],
title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!' title='Mean of all sentiments from max positive (1) to min negative (-1)'
) )
axes[1].grid(True) axes[1].grid(True)
axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
@ -130,9 +92,7 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
axes[1].tick_params(which='minor', length=0) axes[1].tick_params(which='minor', length=0)
plotFileUrl = f'./plots/{TodayDate}.png' plotFileUrl = f'./plots/{TodayDate}.png'
plt.savefig(plotFileUrl) plt.savefig(plotFileUrl)
print('done!')
print('Send toot...')
#media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") #media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') #mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
print('done!')

View file

@ -1,7 +1,5 @@
from mastodon import Mastodon from mastodon import Mastodon
class MastodonAccountManager(): class MastodonAccountManager():
"""Initialize the Mastodon account. def __init__(self):
""" self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
def __init__(self):
self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')

View file

@ -1,19 +1,4 @@
# Hedonodon # Hedonodon
## Prerequisites I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds.
Install the dependencies with `python -m pip install -r requirements.txt`.
Install SpaCys nlp model with `python -m spacy download en_core_web_lg`. More Documentation coming soon!
If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fails, go to the model pages on Hugging Face (see the Models section) and download them to the respective folder (cardiffnlp/twitter-roberta-base-sentiment).
## Purpose
Hedonodon fetches toots from fedihum.org, calculates the sentiments, the sentiment mean and the word frequencies of each day, and creates fancy diagrams from the data.
## Motivation
This tool was created to understand how sentiment analysis and NLP methods work, so it may lack proper use of models etc...
## Models
It uses ["germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for German toots, ["twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for
English toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages.
For the word counts I first translate the toots to English with the GoogleTranslator from [deep_translator](https://pypi.org/project/deep-translator/) and then use spaCy's NLP model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies.
## Weaknesses
Since some modules do not return sentiment compounds, I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statistically not okay (;-_-).

View file

@ -1,43 +1,26 @@
from germansentiment import SentimentModel from germansentiment import SentimentModel
from pandas import DataFrame
import numpy as np import numpy as np
from scipy.special import softmax from scipy.special import softmax
from transformers import AutoModelForSequenceClassification from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import spacy
from collections import Counter
# Preprocess text (username and link placeholders) # Preprocess text (username and link placeholders)
def preprocess(text:str) -> str: def preprocess(text):
"""Removes tags and urls from text.
Parameters
------
text: str
The raw toot from Mastodon.
Returns
------
str
The cleaned text.
"""
new_text = [] new_text = []
for t in text.split(" "): for t in text.split(" "):
t = '' if t.startswith('@') and len(t) > 1 else t t = '@user' if t.startswith('@') and len(t) > 1 else t
t = '' if t.startswith('http') else t t = 'http' if t.startswith('http') else t
new_text.append(t) new_text.append(t)
return " ".join(new_text) return " ".join(new_text)
class SentiTooter: class SentiTooter:
"""Class to analyze the toots. """"""
"""
def __init__(self): def __init__(self):
"""Initilize the sentiment models and labels.
"""
self.deModel = SentimentModel() self.deModel = SentimentModel()
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
self.enModel, self.enTokenizer = self.initModel() self.enModel, self.enTokenizer = self.initModel()
@ -45,20 +28,7 @@ class SentiTooter:
self.labels = ['negative', 'neutral', 'positive'] self.labels = ['negative', 'neutral', 'positive']
self.sia = SentimentIntensityAnalyzer() self.sia = SentimentIntensityAnalyzer()
def analyze(self, language:str, content:str) -> list[str, str, float]: def analyze(self, language, content):
"""Analyzes the sentiments of the toots.
Parameters
------
language: str
The language tag of the toot.
content: str
The toot content.
Returns
------
list[str, str, float]
A list with the sentiment, analyzer type, and sentiment score.
"""
match language: match language:
case 'de': case 'de':
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True) sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -71,15 +41,15 @@ class SentiTooter:
output = self.enModel(**encoded_input) output = self.enModel(**encoded_input)
scores = output[0][0].detach().numpy() scores = output[0][0].detach().numpy()
scores = softmax(scores) scores = softmax(scores)
#print(scores) print(scores)
sentimentIndexWithMaxScore = np.argmax(scores) sentimentIndexWithMaxScore = np.argmax(scores)
sentimentLabel = self.labels[sentimentIndexWithMaxScore] sentimentLabel = self.labels[sentimentIndexWithMaxScore]
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)] sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
#print(sentiment) print(sentiment)
return sentiment return sentiment
case _: case _:
compound = self.sia.polarity_scores(content)['compound'] compound = self.sia.polarity_scores(content)['compound']
#print(self.sia.polarity_scores(content), 'vaderSentiment') print(self.sia.polarity_scores(content), 'vaderSentiment')
if compound > (1 / 3): if compound > (1 / 3):
return ['positive', 'vaderSentiment'] return ['positive', 'vaderSentiment']
elif compound < (-1 / 3): elif compound < (-1 / 3):
@ -88,14 +58,8 @@ class SentiTooter:
return ['neutral', 'vaderSentiment'] return ['neutral', 'vaderSentiment']
def initModel(self):
"""Initialize the english models.
Returns def initModel(self):
------
tupel
The pretrained model and tokenizer.
"""
# PT # PT
tokenizer = AutoTokenizer.from_pretrained(self.enModelType) tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
tokenizer.save_pretrained(self.enModelType) tokenizer.save_pretrained(self.enModelType)
@ -103,93 +67,12 @@ class SentiTooter:
model.save_pretrained(self.enModelType) model.save_pretrained(self.enModelType)
return model, tokenizer return model, tokenizer
def translateToots(yesterdaysToots: DataFrame) -> DataFrame:
    """Translate all non-English toots to English.

    Rows whose ``language`` is already ``'en'`` are kept untouched. Every
    other row has its ``toot`` replaced by the translation and its
    ``language`` set to ``'en'``. Rows whose translation raises are removed
    from the result.

    Parameters
    ------
    yesterdaysToots: DataFrame
        Frame with at least the columns ``language`` and ``toot``. It is not
        modified; the function works on a copy.

    Returns
    ------
    DataFrame
        A new frame containing the English toots.
    """
    translatedToots = yesterdaysToots.copy()
    failedIndexes = []
    for index, row in translatedToots.iterrows():
        if row['language'] == 'en':
            continue
        try:
            translatedToots.at[index, 'toot'] = translateToot(row['language'], row['toot'])
            translatedToots.at[index, 'language'] = 'en'
        except Exception:
            # Best effort: a toot that cannot be translated is left out.
            failedIndexes.append(index)
    # DataFrame.drop returns a new frame, so its result must be used; the
    # rows are removed after the loop to avoid resizing while iterating.
    return translatedToots.drop(index=failedIndexes)
def translateToot(language: str, toot: str) -> str:
    """Translate a single toot into English.

    The toot is first passed through ``preprocess`` and the result is sent
    to ``GoogleTranslator``.

    Parameters
    ------
    language: str
        The language of the toot, used as the translator's source language.
    toot: str
        The toot content.

    Returns
    ------
    str
        The toot translated into English.
    """
    cleanedToot = preprocess(toot)
    translator = GoogleTranslator(source=language, target='en')
    return translator.translate(cleanedToot)
# spaCy pipelines that are already loaded, keyed by model name.
_NLP_PIPELINES = {}


def _loadNlpPipeline(modelName: str = 'en_core_web_lg'):
    """Return the spaCy pipeline for ``modelName``, loading it only once."""
    if modelName not in _NLP_PIPELINES:
        _NLP_PIPELINES[modelName] = spacy.load(modelName)
    return _NLP_PIPELINES[modelName]


def countWords(concatedToots: str, number: int) -> list:
    """Count the most frequent nouns in the given text.

    Parameters
    ------
    concatedToots: str
        All toots of one sentiment, joined into a single string.
    number: int
        How many of the most frequent nouns to return.

    Returns
    ------
    list
        ``(word, frequency)`` tuples, most frequent first, at most
        ``number`` entries.
    """
    # The pipeline is reused across calls instead of being reloaded from
    # disk for every sentiment.
    doc = _loadNlpPipeline()(concatedToots)
    # Keep only noun tokens that are neither stop words nor punctuation.
    nounFrequencies = Counter(
        token.text
        for token in doc
        if token.pos_ == "NOUN" and not token.is_stop and not token.is_punct
    )
    return nounFrequencies.most_common(number)
def createWordFrequenciesPerSentiment(translatedToots: DataFrame) -> str:
    """Build a text summary of the top word counts for each sentiment.

    For each of ``positive``, ``neutral`` and ``negative`` (in that order)
    the toots with that sentiment are joined with spaces and handed to
    ``countWords`` with a limit of 5.

    Parameters
    ------
    translatedToots: DataFrame
        Frame with the columns ``sentiment`` and ``toot``.

    Returns
    ------
    str
        One line per sentiment in the form
        ``sentiment: word (count), word (count), ...``, joined by newlines.
    """
    summaryLines = []
    for sentiment in ('positive', 'neutral', 'negative'):
        hasSentiment = translatedToots['sentiment'] == sentiment
        joinedToots = translatedToots.loc[hasSentiment, 'toot'].str.cat(sep=' ')
        topWords = countWords(joinedToots, 5)
        formattedCounts = ', '.join(f'{word} ({count})' for word, count in topWords)
        summaryLines.append(f'{sentiment}: {formattedCounts}')
    return '\n'.join(summaryLines)

View file

@ -1,32 +1,32 @@
"""This script containing the table definitions for the database.""" from DbSetup import Base
from sqlalchemy import Column, Date, Integer, Float, String
from DbSetup import Base
from sqlalchemy import Column, Date, Integer, Float, String class Toots(Base):
__tablename__ = 'Toots'
class Toots(Base): __table_args__ = {'extend_existing': True}
__tablename__ = 'Toots' index = Column(Integer, primary_key=True)
__table_args__ = {'extend_existing': True} model = Column(String(30))
index = Column(Integer, primary_key=True) datetime = Column(Date)
model = Column(String(30)) language = Column(String(3))
datetime = Column(Date) sentiment = Column(String(8))
language = Column(String(3)) tootId = Column(String(255))
sentiment = Column(String(8)) toot = Column(String(600))
tootId = Column(String(255)) userName = Column(String(255))
toot = Column(String(600)) userId = Column(String(255))
userName = Column(String(255))
userId = Column(String(255))
class SentimentCounts(Base): class SentimentCounts(Base):
__tablename__ = 'SentimentCounts' __tablename__ = 'SentimentCounts'
__table_args__ = {'extend_existing': True} __table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True) index = Column(Integer, primary_key=True)
sentimentCount = Column(Integer) sentimentCount = Column(Integer)
date = Column(Date, primary_key=True) date = Column(Date, primary_key=True)
sentiment = Column(String(8)) sentiment = Column(String(8))
class SentimentMeans(Base): class SentimentMeans(Base):
__tablename__ = 'SentimentMeans' __tablename__ = 'SentimentMeans'
__table_args__ = {'extend_existing': True} __table_args__ = {'extend_existing': True}
index = Column(Integer, primary_key=True) index = Column(Integer, primary_key=True)
date = Column(Date, primary_key=True) date = Column(Date, primary_key=True)
SentimentsMean = Column(Float) SentimentsMean = Column(Float)

View file

@ -1,111 +1,48 @@
from langdetect import detect from langdetect import detect
import pytz import pytz
import pandas as pd import pandas as pd
from pandas import DataFrame import re
import re from SentiTooter import SentiTooter
from SentiTooter import SentiTooter from pprint import pprint
from pprint import pprint
class TootCrawler():
class TootCrawler():
"""Class to fetch the recent toots from fedihum.org.""" def __init__(self, mastodonInstance) -> None:
self.mastodonInstance = mastodonInstance
def __init__(self, mastodonInstance: any) -> None: self.compilePattern = re.compile('<.*?>')
"""Initialize the Mastodon instance and depending classes. self.sentiTooter = SentiTooter()
self.localTimezone = pytz.timezone('Europe/Berlin')
Parameters
------ def getLocalTimeline(self, minId=None):
mastodonInstance: any return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
The initialized Mastodon instance.
""" def cleanhtml(self, raw_html):
self.mastodonInstance = mastodonInstance cleantext = re.sub(self.compilePattern, '', raw_html)
self.compilePattern = re.compile('<.*?>') cleantext = re.sub(r'http\S+', '', cleantext)
self.sentiTooter = SentiTooter() return cleantext
self.localTimezone = pytz.timezone('Europe/Berlin')
def buildTootsDataframe(self, minId=None):
def getLocalTimeline(self, minId=None) -> any: toots = []
"""Receave the local timeline allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId)
Parameters
------ while timelinePagination:
minId: str | None allTimelineResults = allTimelineResults + timelinePagination
The last fetched toot id from the database. timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
for i in allTimelineResults:
Returns content = self.cleanhtml(i.content)
------ language = detect(content)
any sentiment = self.sentiTooter.analyze(language, content)
The local Mastodon timeline from fedihum.org. toot = {
""" "sentiment": sentiment[0],
return self.mastodonInstance.timeline_local(min_id=minId, limit=500) "model": sentiment[1],
"toot": content,
def cleanhtml(self, raw_html:str) -> str: "datetime": i.created_at.astimezone(self.localTimezone),
"""remove brackets and http string from toots "language": language,
"userName": i.account.display_name,
Parameters "userId": i.account.id,
------ "tootId": i.id
raw_html: str }
The toot content. toots.append(toot)
Returns toots.sort(key=lambda item:item.get('datetime'))
------
str:
The cleaned toot content.
"""
cleantext = re.sub(self.compilePattern, '', raw_html)
cleantext = re.sub(r'http\S+', '', cleantext)
return cleantext
def buildTootsDataframe(self, minId=None) -> DataFrame:
"""Parse fetched toots from Mastodon to dataframe.
Parameters
------
minId: str | None
The id of the last fetched toot.
Returns
------
DataFrame
A Dataframe containing
sentiment: str
The sentiment (positive, neutral, negative)
model: str
The used sentiment model.
toot: str
The content of the toot.
datetime: datetime
The datetime of the toot.
language: str
The langage flag of the toot.
userName: str.
The user name of the toot.
userId: str
The user id.
tootId: str
The toot id.
"""
toots = []
allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId)
while timelinePagination:
allTimelineResults = allTimelineResults + timelinePagination
timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
for i in allTimelineResults:
content = self.cleanhtml(i.content)
try:
language = detect(content)
except:
language = None
sentiment = self.sentiTooter.analyze(language, content)
toot = {
"sentiment": sentiment[0],
"model": sentiment[1],
"toot": content,
"datetime": i.created_at.astimezone(self.localTimezone),
"language": language,
"userName": i.account.display_name,
"userId": i.account.id,
"tootId": i.id
}
toots.append(toot)
toots.sort(key=lambda item:item.get('datetime'))
return pd.DataFrame.from_records(toots) return pd.DataFrame.from_records(toots)

View file

@ -3,12 +3,6 @@ matplotlib
pandas pandas
sqlalchemy sqlalchemy
vader-multi vader-multi
langdetect
numpy numpy
pytz pytz
transformers transformers
wheel
germansentiment
scipy
deep_translator
spacy