diff --git a/.gitignore b/.gitignore index 613ca0d..d946327 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,11 @@ -database.db -plots -instance -__pycache__ -hedonodon_clientcred.secret -hedonodon_usercred.secret -.fleet -test.py -.idea -cardiffnlp -venv -logs.txt +database.db +plots +instance +__pycache__ +hedonodon_clientcred.secret +hedonodon_usercred.secret +.fleet +test.py +.idea +cardiffnlp +venv \ No newline at end of file diff --git a/CRUDManager.py b/CRUDManager.py index cd0c7e0..0226fbf 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -1,21 +1,10 @@ -from DbSetup import connection, engine, session, databaseUrl +from DbSetup import engine, session, databaseUrl import pandas as pd -from sqlalchemy import desc, select, sql +from sqlalchemy import desc, select from Tables import Toots -from pandas.core.api import ( - DataFrame) def calculateSentimentCount(): - """Calculates the frequencies of the sentiments. - - Returns - ------- - DataFrame - Containing date (YY-MM-DD), sentiment (positive, neutral, negative), - and sentimentCount. - """ - query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount FROM Toots GROUP BY DATE(datetime), @@ -23,23 +12,12 @@ def calculateSentimentCount(): HAVING datetime >= DATE("now","-1 day") AND datetime < DATE("now")''' return pd.read_sql( - sql.text(query), - connection, + query, + databaseUrl, parse_dates=["datetime"] ) -def calculateSentimentMean(dataframe:DataFrame) -> DataFrame: - """Calculates the mean of the sentiments. - - Parameters - ------- - dataframe: DataFrame - - Returns - ------- - Dataframe - Containing date (YY-MM-DD), sentimentsMean. - """ +def calculateSentimentMean(dataframe): negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1 positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum() sentimentSum = dataframe['sentimentCount'].sum() @@ -54,68 +32,17 @@ def calculateSentimentMean(dataframe:DataFrame) -> DataFrame: ] ) -def getYesterdaysToots() -> DataFrame: - """Fetches yesterdays toots from database. - - Returns - ------- - pd.Dataframe - Containing date (YY-MM-DD), language, sentiment, toot. - """ - query = f'''SELECT datetime as date, language, sentiment, toot - FROM Toots - WHERE datetime >= DATE("now","-1 day") - AND datetime < DATE("now")''' - return pd.read_sql( - sql.text(query), - connection, - parse_dates=["datetime"] - ) - class CRUDManager(): - """Class for database operations""" - def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False): - """Saves dataframe to database. - - Parameters - ------- - dataframe: DataFrame - Input dataframe. - table: str - Table, where to save the data. - useIndex: boolean - Should the index of the dataframe be used as index for - the database table? - """ + def saveToDatabase(self, dataframe, table:str, useIndex=False): try: dataframe.to_sql(table, engine, index=useIndex, if_exists="append") except: print(f'Could not save data to {table}!') - def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame: - """Load a table into a dataframe. + def loadFromDatabase(self, table:str, indexColumn=None): + return pd.read_sql_table(table, databaseUrl, index_col=indexColumn) - Parameters - ------- - table: str - Table, where to save the data. - indexColumn: str | None - Should the index of the table be used as index for - the dataframe? - Returns - ------- - DataFrame - """ - return pd.read_sql_table(table, connection, index_col=indexColumn) - - def getLastToot(self) -> str: - """Query the last toot id from database. - - Results - ------- - str - A toot id. - """ + def getLastToot(self): stmt = select(Toots.tootId).order_by(desc('datetime')) - return session.scalars(stmt).first() + return session.scalars(stmt).first() \ No newline at end of file diff --git a/DbSetup.py b/DbSetup.py index b6179f7..1898556 100644 --- a/DbSetup.py +++ b/DbSetup.py @@ -1,18 +1,11 @@ -"""Script to initialize the database. - Serves database url, engine, connection and session. -""" - -from sqlalchemy import create_engine -from sqlalchemy.orm import Session -from sqlalchemy.ext.declarative import declarative_base - -databaseUrl = 'sqlite:///database.db' -engine = create_engine(databaseUrl, future=True) -connection = engine.connect() -session = Session(engine) -Base = declarative_base() - -def init_db(): - """Initialize the database. - """ - Base.metadata.create_all(bind=engine) +from sqlalchemy import create_engine +from sqlalchemy.orm import Session +from sqlalchemy.ext.declarative import declarative_base + +databaseUrl = 'sqlite:///database.db' +engine = create_engine(databaseUrl, future=True) +session = Session(engine) +Base = declarative_base() + +def init_db(): + Base.metadata.create_all(bind=engine) diff --git a/Main.py b/Main.py index 5bb6532..a64a63c 100644 --- a/Main.py +++ b/Main.py @@ -1,20 +1,4 @@ -""" -Hedonodon toot sentiment analyzer. - -This programm fetches toots from the fedihum.org Mastodon instance, calculates -the frequencies of the sentiments (positive, neutral, negative) and the mean -from these nominal values (even this is not statistical correct (;-_-)!, but -not all analyzer return compounds). -It also calculates the word count of the nouns per sentiment. - -It uses germansentiment for german toots, twitter-roberta-base-sentiment for -english toots, and vaderSentiment for other languages. - -For the word counts I translate the toots to english with the GoogleTranslator -first. -""" - -from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean from datetime import datetime, date from DbSetup import init_db import locale @@ -22,12 +6,10 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler -from SentiTooter import translateToots, createWordFrequenciesPerSentiment -locale.setlocale(locale.LC_TIME, "en_US.UTF-8") +locale.setlocale(locale.LC_TIME, "en_EN.UTF-8") init_db() -print('Initialize Mastodon...') mastodonAccountManager = MastodonAccountManager() mastodonInstance = mastodonAccountManager.instance """ @@ -37,47 +19,27 @@ mastodonInstance.log_in( to_file = 'hedonodon_usercred.secret' ) """ -print('done!') -print('Fetching recent toots...') tootCrawler = TootCrawler(mastodonInstance) crudManager = CRUDManager() + lastTootId = crudManager.getLastToot() tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId) -print('done!') - -print('Save toots to database...') +exit() if not tootsDataframe.empty: crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False) - print('done!') else: - print('nothing changed since last database insert!') + print('Nothing changed since last database insert!') -print('Calculate word counts...') -yesterdaysToots = getYesterdaysToots() -translatedToots = translateToots(yesterdaysToots) -wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots) -print('done!') - -print(wordCountsPerSentiment); - -print('Calculate sentiment counts...') sentimentsYesterday = calculateSentimentCount() -print('done!') - -print('Calculate sentiment mean...') sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday) -print('done!') -print('Save calculations to database...') if not tootsDataframe.empty: crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True) crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True) - print('done!') else: - print('nothing changed since last database insert!') + print('Nothing changed since last database insert!') -print('Create figure...') colormap = { 'negative': '#ff9999', 'neutral': '#ffcc99', @@ -118,7 +80,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9)) # Line chart. lineChart = dataframe4LineChart.plot.line( ax=axes[1], - title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!' + title='Mean of all sentiments from max positive (1) to min negative (-1)' ) axes[1].grid(True) axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)]) @@ -130,9 +92,7 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h')) axes[1].tick_params(which='minor', length=0) plotFileUrl = f'./plots/{TodayDate}.png' plt.savefig(plotFileUrl) -print('done!') -print('Send toot...') #media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.") -#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en') -print('done!') +#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en') + diff --git a/MastodonAccountManager.py b/MastodonAccountManager.py index 957ac4e..9c51e54 100644 --- a/MastodonAccountManager.py +++ b/MastodonAccountManager.py @@ -1,7 +1,5 @@ -from mastodon import Mastodon - -class MastodonAccountManager(): - """Initialize the Mastodon account. - """ - def __init__(self): - self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') +from mastodon import Mastodon + +class MastodonAccountManager(): + def __init__(self): + self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret') diff --git a/README.md b/README.md index 58d02b6..fdbe2f7 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,4 @@ -# Hedonodon -## Prerequisites -Install the dependencies with `python -m pip install -r requirements.txt`. -Install SpaCys nlp model with `python -m spacy download en_core_web_lg`. -If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fail, go to the model pages on hugging face (see models section) and download the to the respective folder (cardiffnlp/twitter-roberta-base-sentiment). - -## Purpose -Hedonodon fetch toots from fedihum.org and calculates the sentiments, sentiment mean and word frequencies of each day, and creates fancy diagrams from the data. - -## Motivation -This tool was created to understand how sentiment analyses and nlp methods works, so it may lacks of proper use of models etc... - -## Models -It uses "germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for german toots, []"twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for -english toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages. -For the word counts I translate the toots to english with the GoogleTranslator from [deep_translater](https://pypi.org/project/deep-translator/) first and then use SpaCys nlp model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies. - -## Weaknesses -Since some moduls do not return sentiment compounds I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statisticaly not okay (;-_-). \ No newline at end of file +# Hedonodon +I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds. + +More Documentation coming soon! \ No newline at end of file diff --git a/SentiTooter.py b/SentiTooter.py index aabc498..9626078 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -1,43 +1,26 @@ from germansentiment import SentimentModel -from pandas import DataFrame import numpy as np from scipy.special import softmax from transformers import AutoModelForSequenceClassification from transformers import AutoTokenizer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer -from deep_translator import GoogleTranslator -import spacy -from collections import Counter + # Preprocess text (username and link placeholders) -def preprocess(text:str) -> str: - """Removes tags and urls from text. - - Parameters - ------ - text: str - The raw toot from Mastodon. - Returns - ------ - str - The cleaned text. - """ +def preprocess(text): new_text = [] for t in text.split(" "): - t = '' if t.startswith('@') and len(t) > 1 else t - t = '' if t.startswith('http') else t + t = '@user' if t.startswith('@') and len(t) > 1 else t + t = 'http' if t.startswith('http') else t new_text.append(t) return " ".join(new_text) class SentiTooter: - """Class to analyze the toots. - """ + """""" def __init__(self): - """Initilize the sentiment models and labels. - """ self.deModel = SentimentModel() self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment" self.enModel, self.enTokenizer = self.initModel() @@ -45,20 +28,7 @@ class SentiTooter: self.labels = ['negative', 'neutral', 'positive'] self.sia = SentimentIntensityAnalyzer() - def analyze(self, language:str, content:str) -> list[str, str, float]: - """Analyzes the sentiments of the toots. - - Parameters - ------ - language: str - The language tag of the toot. - content: str - The toot content. - Returns - ------ - list[str, str, float] - A list with the sentiment, analyzer type, and sentiment score. - """ + def analyze(self, language, content): match language: case 'de': sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True) @@ -71,15 +41,15 @@ class SentiTooter: output = self.enModel(**encoded_input) scores = output[0][0].detach().numpy() scores = softmax(scores) - #print(scores) + print(scores) sentimentIndexWithMaxScore = np.argmax(scores) sentimentLabel = self.labels[sentimentIndexWithMaxScore] sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)] - #print(sentiment) + print(sentiment) return sentiment case _: compound = self.sia.polarity_scores(content)['compound'] - #print(self.sia.polarity_scores(content), 'vaderSentiment') + print(self.sia.polarity_scores(content), 'vaderSentiment') if compound > (1 / 3): return ['positive', 'vaderSentiment'] elif compound < (-1 / 3): @@ -88,14 +58,8 @@ class SentiTooter: return ['neutral', 'vaderSentiment'] - def initModel(self): - """Initialize the english models. - Returns - ------ - tupel - The pretrained model and tokenizer. - """ + def initModel(self): # PT tokenizer = AutoTokenizer.from_pretrained(self.enModelType) tokenizer.save_pretrained(self.enModelType) @@ -103,93 +67,12 @@ class SentiTooter: model.save_pretrained(self.enModelType) return model, tokenizer -def translateToots(yesterdaysToots:DataFrame) -> DataFrame: - """Translates all toots to english. + # # TF + # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL) + # model.save_pretrained(MODEL) - Returns - ------ - Dataframe - Containing the english translated toots. - """ - yesterdaysTootsTranslated = yesterdaysToots - for index, row in yesterdaysTootsTranslated.iterrows(): - if (row['language'] != 'en'): - try: - yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot']) - yesterdaysTootsTranslated.at[index,'language'] = 'en' - except: - yesterdaysTootsTranslated.drop(index) - return yesterdaysTootsTranslated - -def translateToot(language:str, toot:str) -> str: - """Translate a toot in english. - - Parameters - ------ - language:str - The language of the toot. - toot: str - The toot content. - - Returns - ------ - str - The in english translated toot. - """ - content = preprocess(toot) - return GoogleTranslator(source=language, target='en').translate(content) - -def countWords(concatedToots: str, number: int) -> list: - """Counts the word frequencies in all toots of a given sentiment. - - Parameters - ------ - concatedToots: str - All toots from a sentiment. - number: int - Number of words to calculate word frequencies. - - Returns - ------ - list - List containing tuple of word and word frequency. - """ - nlp = spacy.load('en_core_web_lg') - doc = nlp(concatedToots) - - # noun tokens that arent stop words or punctuations - nouns = [token.text - for token in doc - if (not token.is_stop and - not token.is_punct and - token.pos_ == "NOUN")] - - # five most common noun tokens - noun_freq = Counter(nouns) - return noun_freq.most_common(number) - -def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str: - """Count all word frequencies of all toots per sentiment. - - Paramters - ------ - translatedToots: DataFrame - The dataframe with all toots in english. - - Returns - ------ - str - Containing words and wourd counts per sentiment. - """ - sentimentList = [] - for sentiment in ['positive', 'neutral', 'negative']: - tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot - wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5) - FrequenciesList = [] - for Frequencies in wordFrequencies: - FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')') - list2String = ', '.join(FrequenciesList) - sentimentString = sentiment + ': ' + list2String - sentimentList.append(sentimentString) - wordFrequenciessPerSentiments = '\n'.join(sentimentList) - return wordFrequenciessPerSentiments \ No newline at end of file + # text = "Good night 😊" + # encoded_input = tokenizer(text, return_tensors='tf') + # output = model(encoded_input) + # scores = output[0][0].numpy() + # scores = softmax(scores) diff --git a/Tables.py b/Tables.py index 071179a..78aa412 100644 --- a/Tables.py +++ b/Tables.py @@ -1,32 +1,32 @@ -"""This script containing the table definitions for the database.""" - -from DbSetup import Base -from sqlalchemy import Column, Date, Integer, Float, String - -class Toots(Base): - __tablename__ = 'Toots' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - model = Column(String(30)) - datetime = Column(Date) - language = Column(String(3)) - sentiment = Column(String(8)) - tootId = Column(String(255)) - toot = Column(String(600)) - userName = Column(String(255)) - userId = Column(String(255)) - -class SentimentCounts(Base): - __tablename__ = 'SentimentCounts' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - sentimentCount = Column(Integer) - date = Column(Date, primary_key=True) - sentiment = Column(String(8)) - -class SentimentMeans(Base): - __tablename__ = 'SentimentMeans' - __table_args__ = {'extend_existing': True} - index = Column(Integer, primary_key=True) - date = Column(Date, primary_key=True) +from DbSetup import Base +from sqlalchemy import Column, Date, Integer, Float, String + +class Toots(Base): + __tablename__ = 'Toots' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + model = Column(String(30)) + datetime = Column(Date) + language = Column(String(3)) + sentiment = Column(String(8)) + tootId = Column(String(255)) + toot = Column(String(600)) + userName = Column(String(255)) + userId = Column(String(255)) + + + +class SentimentCounts(Base): + __tablename__ = 'SentimentCounts' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + sentimentCount = Column(Integer) + date = Column(Date, primary_key=True) + sentiment = Column(String(8)) + +class SentimentMeans(Base): + __tablename__ = 'SentimentMeans' + __table_args__ = {'extend_existing': True} + index = Column(Integer, primary_key=True) + date = Column(Date, primary_key=True) SentimentsMean = Column(Float) \ No newline at end of file diff --git a/TootCrawler.py b/TootCrawler.py index 67cd722..1b081c2 100644 --- a/TootCrawler.py +++ b/TootCrawler.py @@ -1,111 +1,48 @@ -from langdetect import detect -import pytz -import pandas as pd -from pandas import DataFrame -import re -from SentiTooter import SentiTooter -from pprint import pprint - -class TootCrawler(): - """Class to fetch the recent toots from fedihum.org.""" - - def __init__(self, mastodonInstance: any) -> None: - """Initialize the Mastodon instance and depending classes. - - Parameters - ------ - mastodonInstance: any - The initialized Mastodon instance. - """ - self.mastodonInstance = mastodonInstance - self.compilePattern = re.compile('<.*?>') - self.sentiTooter = SentiTooter() - self.localTimezone = pytz.timezone('Europe/Berlin') - - def getLocalTimeline(self, minId=None) -> any: - """Receave the local timeline - - Parameters - ------ - minId: str | None - The last fetched toot id from the database. - - Returns - ------ - any - The local Mastodon timeline from fedihum.org. - """ - return self.mastodonInstance.timeline_local(min_id=minId, limit=500) - - def cleanhtml(self, raw_html:str) -> str: - """remove brackets and http string from toots - - Parameters - ------ - raw_html: str - The toot content. - Returns - ------ - str: - The cleaned toot content. - """ - cleantext = re.sub(self.compilePattern, '', raw_html) - cleantext = re.sub(r'http\S+', '', cleantext) - return cleantext - - def buildTootsDataframe(self, minId=None) -> DataFrame: - """Parse fetched toots from Mastodon to dataframe. - - Parameters - ------ - minId: str | None - The id of the last fetched toot. - - Returns - ------ - DataFrame - A Dataframe containing - sentiment: str - The sentiment (positive, neutral, negative) - model: str - The used sentiment model. - toot: str - The content of the toot. - datetime: datetime - The datetime of the toot. - language: str - The langage flag of the toot. - userName: str. - The user name of the toot. - userId: str - The user id. - tootId: str - The toot id. - """ - toots = [] - allTimelineResults = [] - timelinePagination = self.getLocalTimeline(minId) - - while timelinePagination: - allTimelineResults = allTimelineResults + timelinePagination - timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) - for i in allTimelineResults: - content = self.cleanhtml(i.content) - try: - language = detect(content) - except: - language = None - sentiment = self.sentiTooter.analyze(language, content) - toot = { - "sentiment": sentiment[0], - "model": sentiment[1], - "toot": content, - "datetime": i.created_at.astimezone(self.localTimezone), - "language": language, - "userName": i.account.display_name, - "userId": i.account.id, - "tootId": i.id - } - toots.append(toot) - toots.sort(key=lambda item:item.get('datetime')) +from langdetect import detect +import pytz +import pandas as pd +import re +from SentiTooter import SentiTooter +from pprint import pprint + +class TootCrawler(): + + def __init__(self, mastodonInstance) -> None: + self.mastodonInstance = mastodonInstance + self.compilePattern = re.compile('<.*?>') + self.sentiTooter = SentiTooter() + self.localTimezone = pytz.timezone('Europe/Berlin') + + def getLocalTimeline(self, minId=None): + return self.mastodonInstance.timeline_local(min_id=minId, limit=500) + + def cleanhtml(self, raw_html): + cleantext = re.sub(self.compilePattern, '', raw_html) + cleantext = re.sub(r'http\S+', '', cleantext) + return cleantext + + def buildTootsDataframe(self, minId=None): + toots = [] + allTimelineResults = [] + timelinePagination = self.getLocalTimeline(minId) + + while timelinePagination: + allTimelineResults = allTimelineResults + timelinePagination + timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination) + for i in allTimelineResults: + content = self.cleanhtml(i.content) + language = detect(content) + sentiment = self.sentiTooter.analyze(language, content) + toot = { + "sentiment": sentiment[0], + "model": sentiment[1], + "toot": content, + "datetime": i.created_at.astimezone(self.localTimezone), + "language": language, + "userName": i.account.display_name, + "userId": i.account.id, + "tootId": i.id + } + toots.append(toot) + toots.sort(key=lambda item:item.get('datetime')) return pd.DataFrame.from_records(toots) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index bc6906e..842b542 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,12 +3,6 @@ matplotlib pandas sqlalchemy vader-multi -langdetect numpy pytz -transformers -wheel -germansentiment -scipy -deep_translator -spacy \ No newline at end of file +transformers \ No newline at end of file