Fixed some typos

Updated the README
take the large spacy model
2023-03-17 21:29:16 +01:00 · 2023-03-17 21:26:14 +01:00 · 2023-03-17 21:25:44 +01:00 · 2023-03-17 20:06:01 +01:00 · 2023-03-15 16:02:47 +01:00 · 2023-03-15 14:27:07 +01:00
10 changed files with 473 additions and 149 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,11 +1,12 @@
-database.db
-plots
-instance
-__pycache__
-hedonodon_clientcred.secret
-hedonodon_usercred.secret
-.fleet
-test.py
-.idea
-cardiffnlp
-venv
+database.db
+plots
+instance
+__pycache__
+hedonodon_clientcred.secret
+hedonodon_usercred.secret
+.fleet
+test.py
+.idea
+cardiffnlp
+venv
+logs.txt
--- a/CRUDManager.py
+++ b/CRUDManager.py
@ -1,10 +1,21 @@
-from DbSetup import engine, session, databaseUrl
+from DbSetup import connection, engine, session, databaseUrl
 import pandas as pd
-from sqlalchemy import desc, select
+from sqlalchemy import desc, select, sql
 from Tables import Toots

+from pandas.core.api import (
+    DataFrame)

 def calculateSentimentCount():
+    """Calculates the frequencies of the sentiments.
+
+    Returns
+    -------
+    DataFrame
+        Containing date (YY-MM-DD), sentiment (positive, neutral, negative),
+        and sentimentCount.
+    """
+
    query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
                FROM Toots
                GROUP BY DATE(datetime),
@ -12,12 +23,23 @@ def calculateSentimentCount():
                HAVING datetime >= DATE("now","-1 day")
                AND datetime < DATE("now")'''
    return pd.read_sql(
-        query,
-        databaseUrl,
+        sql.text(query),
+        connection,
        parse_dates=["datetime"]
    )

-def calculateSentimentMean(dataframe):
+def calculateSentimentMean(dataframe:DataFrame) -> DataFrame:
+    """Calculates the mean of the sentiments.
+
+    Parameters
+    -------
+        dataframe: DataFrame
+
+    Returns
+    -------
+        Dataframe
+        Containing date (YY-MM-DD), sentimentsMean.
+    """
    negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
    positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
    sentimentSum = dataframe['sentimentCount'].sum()
@ -32,17 +54,68 @@ def calculateSentimentMean(dataframe):
        ]
    )

-class CRUDManager():
+def getYesterdaysToots() -> DataFrame:
+    """Fetches yesterdays toots from database.

-    def saveToDatabase(self, dataframe, table:str, useIndex=False):
+    Returns
+    -------
+        pd.Dataframe
+        Containing date (YY-MM-DD), language, sentiment, toot.
+    """
+    query = f'''SELECT datetime as date, language, sentiment, toot
+                FROM Toots
+                WHERE datetime >= DATE("now","-1 day")
+                AND datetime < DATE("now")'''
+    return pd.read_sql(
+        sql.text(query),
+        connection,
+        parse_dates=["datetime"]
+    )
+
+class CRUDManager():
+    """Class for database operations"""
+
+    def saveToDatabase(self, dataframe:DataFrame, table:str, useIndex=False):
+        """Saves dataframe to database.
+
+        Parameters
+        -------
+            dataframe: DataFrame
+                Input dataframe.
+            table: str
+                Table, where to save the data.
+            useIndex: boolean
+                Should the index of the dataframe be used as index for
+                the database table?
+        """
        try:
            dataframe.to_sql(table, engine, index=useIndex, if_exists="append")
        except:
            print(f'Could not save data to {table}!')

-    def loadFromDatabase(self, table:str, indexColumn=None):
-        return pd.read_sql_table(table, databaseUrl, index_col=indexColumn)
+    def loadFromDatabase(self, table:str, indexColumn=None) -> DataFrame:
+        """Load a table into a dataframe.

-    def getLastToot(self):
+        Parameters
+        -------
+            table: str
+                Table to load the data from.
+            indexColumn: str | None
+                Should the index of the table be used as index for
+                the dataframe?
+        Returns
+        -------
+            DataFrame
+        """
+        return pd.read_sql_table(table, connection, index_col=indexColumn)
+
+    def getLastToot(self) -> str:
+        """Query the last toot id from database.
+
+        Returns
+        -------
+            str
+            A toot id.
+        """
        stmt = select(Toots.tootId).order_by(desc('datetime'))
-        return session.scalars(stmt).first()
+        return session.scalars(stmt).first()
--- a/DbSetup.py
+++ b/DbSetup.py
@ -1,11 +1,18 @@
-from sqlalchemy import create_engine
-from sqlalchemy.orm import Session
-from sqlalchemy.ext.declarative import declarative_base
-
-databaseUrl = 'sqlite:///database.db'
-engine = create_engine(databaseUrl, future=True)
-session = Session(engine)
-Base = declarative_base()
-
-def init_db():
-     Base.metadata.create_all(bind=engine)
+"""Script to initialize the database.
+     Serves database url, engine, connection and session.
+"""
+
+from sqlalchemy import create_engine
+from sqlalchemy.orm import Session
+from sqlalchemy.ext.declarative import declarative_base
+
+databaseUrl = 'sqlite:///database.db'
+engine = create_engine(databaseUrl, future=True)
+connection = engine.connect()
+session = Session(engine)
+Base = declarative_base()
+
+def init_db():
+     """Initialize the database.
+     """
+     Base.metadata.create_all(bind=engine)
--- a/Main.py
+++ b/Main.py
@ -1,4 +1,20 @@
-from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
+"""
+Hedonodon toot sentiment analyzer.
+
+This program fetches toots from the fedihum.org Mastodon instance, calculates
+the frequencies of the sentiments (positive, neutral, negative) and the mean
+of these nominal values (even though this is not statistically correct (;-_-)!,
+since not all analyzers return compounds).
+It also calculates the word count of the nouns per sentiment.
+
+It uses germansentiment for german toots, twitter-roberta-base-sentiment for
+english toots, and vaderSentiment for other languages.
+
+For the word counts I translate the toots to english with the GoogleTranslator
+first.
+"""
+
+from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
 from datetime import datetime, date
 from DbSetup import init_db
 import locale
@ -6,10 +22,12 @@ from MastodonAccountManager import MastodonAccountManager
 import matplotlib.pyplot as plt
 import matplotlib.dates as mdates
 from TootCrawler import TootCrawler
+from SentiTooter import translateToots, createWordFrequenciesPerSentiment

-locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
+locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
 init_db()

+print('Initialize Mastodon...')
 mastodonAccountManager = MastodonAccountManager()
 mastodonInstance = mastodonAccountManager.instance
 """
@ -19,27 +37,47 @@ mastodonInstance.log_in(
    to_file = 'hedonodon_usercred.secret'
 )
 """
+print('done!')

+print('Fetching recent toots...')
 tootCrawler = TootCrawler(mastodonInstance)
 crudManager = CRUDManager()
-
 lastTootId = crudManager.getLastToot()
 tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
-exit()
+print('done!')
+
+print('Save toots to database...')
 if not tootsDataframe.empty:
    crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
+    print('done!')
 else:
-    print('Nothing changed since last database insert!')
+    print('nothing changed since last database insert!')

+print('Calculate word counts...')
+yesterdaysToots = getYesterdaysToots()
+translatedToots = translateToots(yesterdaysToots)
+wordCountsPerSentiment = createWordFrequenciesPerSentiment(translatedToots)
+print('done!')
+
+print(wordCountsPerSentiment);
+
+print('Calculate sentiment counts...')
 sentimentsYesterday = calculateSentimentCount()
-sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
+print('done!')

+print('Calculate sentiment mean...')
+sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
+print('done!')
+
+print('Save calculations to database...')
 if not tootsDataframe.empty:
    crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
    crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
+    print('done!')
 else:
-    print('Nothing changed since last database insert!')
+    print('nothing changed since last database insert!')

+print('Create figure...')
 colormap = {
    'negative': '#ff9999',
    'neutral': '#ffcc99',
@ -80,7 +118,7 @@ axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))
 # Line chart.
 lineChart = dataframe4LineChart.plot.line(
    ax=axes[1],
-    title='Mean of all sentiments from max positive (1) to min negative (-1)'
+    title='"Mean" of sentiments, calculated from nominal values, pos(1), neu (0), neg (-1)!'
 )
 axes[1].grid(True)
 axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
@ -92,7 +130,9 @@ axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
 axes[1].tick_params(which='minor', length=0)
 plotFileUrl = f'./plots/{TodayDate}.png'
 plt.savefig(plotFileUrl)
+print('done!')

+print('Send toot...')
 #media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the sentiment mean up to {TodayDate}.")
-#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
-
+#mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.\nWord counts per sentiment:\n{wordCountsPerSentiment}', media_ids=media, language='en')
+print('done!')
--- a/MastodonAccountManager.py
+++ b/MastodonAccountManager.py
@ -1,5 +1,7 @@
-from mastodon import Mastodon
-
-class MastodonAccountManager():
-    def __init__(self):
-        self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
+from mastodon import Mastodon
+
+class MastodonAccountManager():
+    """Initialize the Mastodon account.
+    """
+    def __init__(self):
+        self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
--- a/README.md
+++ b/README.md
@ -1,4 +1,19 @@
-# Hedonodon
-I'm using [vaderSentiment](https://pypi.org/project/vaderSentiment/) to calculate the compounds.
-
-More Documentation coming soon!
+# Hedonodon
+## Prerequisites
+Install the dependencies with `python -m pip install -r requirements.txt`.
+Install spaCy's NLP model with `python -m spacy download en_core_web_lg`.
+If the automatic download of the twitter-roberta-base-sentiment model and tokenizer fails, go to the model pages on Hugging Face (see the Models section) and download them to the respective folder (cardiffnlp/twitter-roberta-base-sentiment).
+
+## Purpose
+Hedonodon fetches toots from fedihum.org, calculates the sentiments, sentiment mean and word frequencies of each day, and creates fancy diagrams from the data.
+
+## Motivation
+This tool was created to understand how sentiment analysis and NLP methods work, so it may lack proper use of models etc.
+
+## Models
+It uses ["germansentiment"](https://huggingface.co/oliverguhr/german-sentiment-bert) for German toots, ["twitter-roberta-base-sentiment"](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) for
+English toots, and ["vaderSentiment"](https://pypi.org/project/vaderSentiment/) for other languages.
+For the word counts I first translate the toots to English with the GoogleTranslator from [deep_translator](https://pypi.org/project/deep-translator/) and then use spaCy's NLP model ["en_core_web_lg"](https://spacy.io/models/en/) to calculate the word frequencies.
+
+## Weaknesses
+Since some modules do not return sentiment compounds, I have to use the nominal sentiment values (positive, neutral, negative) to calculate the mean of the day, which is statistically not okay (;-_-).
--- a/SentiTooter.py
+++ b/SentiTooter.py
@ -1,26 +1,43 @@
 from germansentiment import SentimentModel
+from pandas import DataFrame
 import numpy as np
 from scipy.special import softmax
 from transformers import AutoModelForSequenceClassification
 from transformers import AutoTokenizer
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
-
+from deep_translator import GoogleTranslator
+import spacy
+from collections import Counter

 # Preprocess text (username and link placeholders)
-def preprocess(text):
+def preprocess(text:str) -> str:
+    """Removes tags and urls from text.
+
+    Parameters
+    ------
+        text: str
+        The raw toot from Mastodon.
+    Returns
+    ------
+        str
+        The cleaned text.
+    """
    new_text = []

    for t in text.split(" "):
-        t = '@user' if t.startswith('@') and len(t) > 1 else t
-        t = 'http' if t.startswith('http') else t
+        t = '' if t.startswith('@') and len(t) > 1 else t
+        t = '' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


 class SentiTooter:
-    """"""
+    """Class to analyze the toots.
+    """

    def __init__(self):
+        """Initilize the sentiment models and labels.
+        """
        self.deModel = SentimentModel()
        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
        self.enModel, self.enTokenizer = self.initModel()
@ -28,7 +45,20 @@ class SentiTooter:
        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

-    def analyze(self, language, content):
+    def analyze(self, language:str, content:str) -> list[str, str, float]:
+        """Analyzes the sentiments of the toots.
+
+        Parameters
+        ------
+            language: str
+            The language tag of the toot.
+            content: str
+            The toot content.
+        Returns
+        ------
+            list[str, str, float]
+            A list with the sentiment, analyzer type, and sentiment score.
+        """
        match language:
            case 'de':
                sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -41,15 +71,15 @@ class SentiTooter:
                output = self.enModel(**encoded_input)
                scores = output[0][0].detach().numpy()
                scores = softmax(scores)
-                print(scores)
+                #print(scores)
                sentimentIndexWithMaxScore = np.argmax(scores)
                sentimentLabel = self.labels[sentimentIndexWithMaxScore]
                sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment', max(scores)]
-                print(sentiment)
+                #print(sentiment)
                return sentiment
            case _:
                compound = self.sia.polarity_scores(content)['compound']
-                print(self.sia.polarity_scores(content), 'vaderSentiment')
+                #print(self.sia.polarity_scores(content), 'vaderSentiment')
                if compound > (1 / 3):
                    return ['positive', 'vaderSentiment']
                elif compound < (-1 / 3):
@ -58,8 +88,14 @@ class SentiTooter:
                    return ['neutral', 'vaderSentiment']


-
    def initModel(self):
+        """Initialize the english models.
+
+        Returns
+        ------
+            tuple
+                The pretrained model and tokenizer.
+        """
        # PT
        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
        tokenizer.save_pretrained(self.enModelType)
@ -67,12 +103,93 @@ class SentiTooter:
        model.save_pretrained(self.enModelType)
        return model, tokenizer

-    # # TF
-    # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
-    # model.save_pretrained(MODEL)
+def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
+    """Translates all toots to english.

-    # text = "Good night 😊"
-    # encoded_input = tokenizer(text, return_tensors='tf')
-    # output = model(encoded_input)
-    # scores = output[0][0].numpy()
-    # scores = softmax(scores)
+    Returns
+    ------
+        Dataframe
+        Containing the english translated toots.
+    """
+    yesterdaysTootsTranslated = yesterdaysToots
+    for index, row in yesterdaysTootsTranslated.iterrows():
+        if (row['language'] != 'en'):
+            try:
+                yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot'])
+                yesterdaysTootsTranslated.at[index,'language'] = 'en'
+            except:
+                yesterdaysTootsTranslated.drop(index)
+    return yesterdaysTootsTranslated
+
+def translateToot(language:str, toot:str) -> str:
+    """Translate a toot in english.
+
+    Parameters
+    ------
+        language:str
+        The language of the toot.
+        toot: str
+        The toot content.
+
+    Returns
+    ------
+        str
+        The toot translated into English.
+    """
+    content = preprocess(toot)
+    return GoogleTranslator(source=language, target='en').translate(content)
+
+def countWords(concatedToots: str, number: int) -> list:
+    """Counts the word frequencies in all toots of a given sentiment.
+
+    Parameters
+    ------
+        concatedToots: str
+        All toots from a sentiment.
+        number: int
+        Number of words to calculate word frequencies.
+
+    Returns
+    ------
+        list
+        List containing tuple of word and word frequency.
+    """
+    nlp = spacy.load('en_core_web_lg')
+    doc = nlp(concatedToots)
+
+    # noun tokens that aren't stop words or punctuation
+    nouns = [token.text
+            for token in doc
+            if (not token.is_stop and
+                not token.is_punct and
+                token.pos_ == "NOUN")]
+
+    # the `number` most common noun tokens
+    noun_freq = Counter(nouns)
+    return noun_freq.most_common(number)
+
+def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
+    """Count all word frequencies of all toots per sentiment.
+
+    Parameters
+    ------
+        translatedToots: DataFrame
+        The dataframe with all toots in english.
+
+    Returns
+    ------
+        str
+        Containing words and word counts per sentiment.
+    """
+    sentimentList = []
+    for sentiment in ['positive', 'neutral', 'negative']:
+        tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
+        wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
+        FrequenciesList = []
+        for Frequencies in wordFrequencies:
+             FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
+        list2String = ', '.join(FrequenciesList)
+        sentimentString = sentiment + ': ' + list2String
+        sentimentList.append(sentimentString)
+    wordFrequenciessPerSentiments = '\n'.join(sentimentList)
+    return wordFrequenciessPerSentiments
--- a/Tables.py
+++ b/Tables.py
@ -1,32 +1,32 @@
-from DbSetup import Base
-from sqlalchemy import Column, Date, Integer, Float, String
-
-class Toots(Base):
-    __tablename__ = 'Toots'
-    __table_args__ = {'extend_existing': True}
-    index = Column(Integer, primary_key=True)
-    model = Column(String(30))
-    datetime = Column(Date)
-    language = Column(String(3))
-    sentiment = Column(String(8))
-    tootId = Column(String(255))
-    toot = Column(String(600))
-    userName = Column(String(255))
-    userId = Column(String(255))
-
-
-
-class SentimentCounts(Base):
-    __tablename__ = 'SentimentCounts'
-    __table_args__ = {'extend_existing': True}
-    index = Column(Integer, primary_key=True)
-    sentimentCount = Column(Integer)
-    date = Column(Date, primary_key=True)
-    sentiment = Column(String(8))
-
-class SentimentMeans(Base):
-    __tablename__ = 'SentimentMeans'
-    __table_args__ = {'extend_existing': True}
-    index = Column(Integer, primary_key=True)
-    date = Column(Date, primary_key=True)
+"""This script containing the table definitions for the database."""
+
+from DbSetup import Base
+from sqlalchemy import Column, Date, Integer, Float, String
+
+class Toots(Base):
+    __tablename__ = 'Toots'
+    __table_args__ = {'extend_existing': True}
+    index = Column(Integer, primary_key=True)
+    model = Column(String(30))
+    datetime = Column(Date)
+    language = Column(String(3))
+    sentiment = Column(String(8))
+    tootId = Column(String(255))
+    toot = Column(String(600))
+    userName = Column(String(255))
+    userId = Column(String(255))
+
+class SentimentCounts(Base):
+    __tablename__ = 'SentimentCounts'
+    __table_args__ = {'extend_existing': True}
+    index = Column(Integer, primary_key=True)
+    sentimentCount = Column(Integer)
+    date = Column(Date, primary_key=True)
+    sentiment = Column(String(8))
+
+class SentimentMeans(Base):
+    __tablename__ = 'SentimentMeans'
+    __table_args__ = {'extend_existing': True}
+    index = Column(Integer, primary_key=True)
+    date = Column(Date, primary_key=True)
    SentimentsMean = Column(Float)
--- a/TootCrawler.py
+++ b/TootCrawler.py
@ -1,48 +1,111 @@
-from langdetect import detect
-import pytz
-import pandas as pd
-import re
-from SentiTooter import SentiTooter
-from pprint import pprint
-
-class TootCrawler():
-
-    def __init__(self, mastodonInstance) -> None:
-        self.mastodonInstance = mastodonInstance
-        self.compilePattern = re.compile('<.*?>')
-        self.sentiTooter = SentiTooter()
-        self.localTimezone = pytz.timezone('Europe/Berlin')
-
-    def getLocalTimeline(self, minId=None):
-        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
-
-    def cleanhtml(self, raw_html):
-        cleantext = re.sub(self.compilePattern, '', raw_html)
-        cleantext = re.sub(r'http\S+', '', cleantext)
-        return cleantext
-
-    def buildTootsDataframe(self, minId=None):
-        toots = []
-        allTimelineResults = []
-        timelinePagination = self.getLocalTimeline(minId)
-
-        while timelinePagination:
-            allTimelineResults = allTimelineResults + timelinePagination
-            timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
-        for i in allTimelineResults:
-            content = self.cleanhtml(i.content)
-            language = detect(content)
-            sentiment = self.sentiTooter.analyze(language, content)
-            toot = {
-                "sentiment": sentiment[0],
-                "model": sentiment[1],
-                "toot": content,
-                "datetime": i.created_at.astimezone(self.localTimezone),
-                "language": language,
-                "userName": i.account.display_name,
-                "userId": i.account.id,
-                "tootId": i.id
-            }
-            toots.append(toot)
-        toots.sort(key=lambda item:item.get('datetime'))
+from langdetect import detect
+import pytz
+import pandas as pd
+from pandas import DataFrame
+import re
+from SentiTooter import SentiTooter
+from pprint import pprint
+
+class TootCrawler():
+    """Class to fetch the recent toots from fedihum.org."""
+
+    def __init__(self, mastodonInstance: any) -> None:
+        """Initialize the Mastodon instance and depending classes.
+
+        Parameters
+        ------
+            mastodonInstance: any
+                The initialized Mastodon instance.
+        """
+        self.mastodonInstance = mastodonInstance
+        self.compilePattern = re.compile('<.*?>')
+        self.sentiTooter = SentiTooter()
+        self.localTimezone = pytz.timezone('Europe/Berlin')
+
+    def getLocalTimeline(self, minId=None) -> any:
+        """Receave the local timeline
+
+        Parameters
+        ------
+            minId: str | None
+                The last fetched toot id from the database.
+
+        Returns
+        ------
+            any
+                The local Mastodon timeline from fedihum.org.
+        """
+        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
+
+    def cleanhtml(self, raw_html:str) -> str:
+        """remove brackets and http string from toots
+
+        Parameters
+        ------
+            raw_html: str
+            The toot content.
+        Returns
+        ------
+            str:
+            The cleaned toot content.
+        """
+        cleantext = re.sub(self.compilePattern, '', raw_html)
+        cleantext = re.sub(r'http\S+', '', cleantext)
+        return cleantext
+
+    def buildTootsDataframe(self, minId=None) -> DataFrame:
+        """Parse fetched toots from Mastodon to dataframe.
+
+        Parameters
+        ------
+            minId: str | None
+            The id of the last fetched toot.
+
+        Returns
+        ------
+            DataFrame
+            A Dataframe containing
+            sentiment: str
+                The sentiment (positive, neutral, negative)
+            model: str
+                The used sentiment model.
+            toot: str
+                The content of the toot.
+            datetime: datetime
+                The datetime of the toot.
+            language: str
+                The language flag of the toot.
+            userName: str.
+                The user name of the toot.
+            userId: str
+                The user id.
+            tootId: str
+                The toot id.
+        """
+        toots = []
+        allTimelineResults = []
+        timelinePagination = self.getLocalTimeline(minId)
+
+        while timelinePagination:
+            allTimelineResults = allTimelineResults + timelinePagination
+            timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
+        for i in allTimelineResults:
+            content = self.cleanhtml(i.content)
+            try:
+                language = detect(content)
+            except:
+                language = None
+            sentiment = self.sentiTooter.analyze(language, content)
+            toot = {
+                "sentiment": sentiment[0],
+                "model": sentiment[1],
+                "toot": content,
+                "datetime": i.created_at.astimezone(self.localTimezone),
+                "language": language,
+                "userName": i.account.display_name,
+                "userId": i.account.id,
+                "tootId": i.id
+            }
+            toots.append(toot)
+        toots.sort(key=lambda item:item.get('datetime'))
        return pd.DataFrame.from_records(toots)
--- a/requirements.txt
+++ b/requirements.txt
@ -3,6 +3,12 @@ matplotlib
 pandas
 sqlalchemy
 vader-multi
+langdetect
 numpy
 pytz
-transformers
+transformers
+wheel
+germansentiment
+scipy
+deep_translator
+spacy
Author	SHA1	Message	Date
rnsrk	03792f2120	Fixed some typos	2023-03-17 21:29:16 +01:00
rnsrk	cafda77e7f	Updated the README	2023-03-17 21:26:14 +01:00
rnsrk	8d9a7fa603	take the large spacy model	2023-03-17 21:25:44 +01:00
rnsrk	bc842244c7	add code documentation	2023-03-17 20:06:01 +01:00
rnsrk	4479bd2429	implement word counts.	2023-03-15 16:02:47 +01:00
rnsrk	6a8caac29e	implement rough wordcount	2023-03-15 14:27:07 +01:00
rnsrk	09fd313a89	Merge branch 'main' into with_cites	2023-03-15 13:25:41 +01:00
rnsrk	3b677e5713	underway to wordcount	2023-03-15 13:21:44 +01:00
Robert Nasarek	8f7c578087	shortend description	2023-03-15 11:16:35 +01:00
Robert Nasarek	79f54079f7	fixed unrecognisable lang bug	2023-01-31 17:51:06 +01:00
Robert Nasarek	2b98565444	made hedonodon server ready	2023-01-27 21:08:25 +01:00