Now using a language-dependent sentiment analyzer. No compound score anymore.

2023-01-05 01:43:11 +01:00 · 2023-01-05 01:43:11 +01:00 · a20f7331bb
commit a20f7331bb
parent f0d4eadf28
8 changed files with 153 additions and 72 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,4 +4,8 @@ instance
 __pycache__
 hedonodon_clientcred.secret
 hedonodon_usercred.secret
-.fleet
+.fleet
+test.py
+.idea
+cardiffnlp
+venv
--- a/CRUDManager.py
+++ b/CRUDManager.py
@ -3,6 +3,35 @@ import pandas as pd
 from sqlalchemy import desc, select
 from Tables import Toots

+
+def calculateSentimentCount():
+    query = f'''SELECT DATE(datetime) as date, sentiment, COUNT(sentiment) as sentimentCount
+                FROM Toots
+                GROUP BY DATE(datetime),
+                sentiment
+                HAVING datetime >= DATE("now","-1 day")
+                AND datetime < DATE("now")'''
+    return pd.read_sql(
+        query,
+        databaseUrl,
+        parse_dates=["datetime"]
+    )
+
+def calculateSentimentMean(dataframe):
+    negativeSentimentSum = dataframe[dataframe['sentiment'] == 'negative']['sentimentCount'].sum() * -1
+    positiveSentimentSum = dataframe[dataframe['sentiment'] == 'positive']['sentimentCount'].sum()
+    sentimentSum = dataframe['sentimentCount'].sum()
+    sentimentMean = (negativeSentimentSum + positiveSentimentSum) / sentimentSum
+    sentimentDate = dataframe.loc[0]['date']
+    return pd.DataFrame.from_records(
+        [
+            {
+                'date': sentimentDate,
+                'sentimentsMean': sentimentMean
+            }
+        ]
+    )
+
 class CRUDManager():

    def saveToDatabase(self, dataframe, table:str, useIndex=False):
@ -16,21 +45,4 @@ class CRUDManager():

    def getLastToot(self):
        stmt = select(Toots.tootId).order_by(desc('datetime'))
-        return session.scalars(stmt).first()
-
-    def calculateAggregates(self, column, aggregate='Count'):
-        if (aggregate=='Count'):
-            addGroup = f', {column} '
-        else:
-            addGroup = ''
-        query = f'''SELECT DATE(datetime) as date {addGroup}, {aggregate}({column}) as {column}{aggregate}
-                    FROM Toots
-                    GROUP BY DATE(datetime)''' \
-                    + addGroup \
-                    + '''HAVING datetime >= DATE("now","-1 day")
-                    AND datetime < DATE("now")'''
-        return pd.read_sql(
-            query,
-            databaseUrl,
-            parse_dates=["datetime"]
-            )
+        return session.scalars(stmt).first()
--- a/Main.py
+++ b/Main.py
@ -1,12 +1,10 @@
-from CRUDManager import CRUDManager
+from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean
 from datetime import datetime, date
 from DbSetup import init_db
 import locale
 from MastodonAccountManager import MastodonAccountManager
 import matplotlib.pyplot as plt
 import matplotlib.dates as mdates
-from matplotlib.ticker import MultipleLocator
-import numpy as np
 from TootCrawler import TootCrawler

 locale.setlocale(locale.LC_TIME, "en_EN.UTF-8")
@ -27,31 +25,38 @@ crudManager = CRUDManager()

 lastTootId = crudManager.getLastToot()
 tootsDataframe = tootCrawler.buildTootsDataframe(lastTootId)
-sentimentsYesterday = crudManager.calculateAggregates('sentiment', 'Count')
+
+if not tootsDataframe.empty:
+    crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
+else:
+    print('Nothing changed since last database insert!')
+
+sentimentsYesterday = calculateSentimentCount()
+sentimentMeansYesterday = calculateSentimentMean(sentimentsYesterday)
+
+if not tootsDataframe.empty:
+    crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='SentimentCounts', useIndex=True)
+    crudManager.saveToDatabase(dataframe=sentimentMeansYesterday, table='SentimentMeans', useIndex=True)
+else:
+    print('Nothing changed since last database insert!')

 colormap = {
-    'negative"': '#ff9999',
+    'negative': '#ff9999',
    'neutral': '#ffcc99',
    "positive": '#99ff99'
 }

 todaysColors = []
 for sentiment in sentimentsYesterday['sentiment'].to_numpy():
-     todaysColors.append(colormap[sentiment])
+    todaysColors.append(colormap[sentiment])

-compoundsYesterday = crudManager.calculateAggregates('compound', 'Avg')
-if not tootsDataframe.empty:
-     crudManager.saveToDatabase(tootsDataframe, 'Toots', useIndex=False)
-     crudManager.saveToDatabase(dataframe=sentimentsYesterday, table='Sentiments', useIndex=True)
-     crudManager.saveToDatabase(dataframe=compoundsYesterday, table='Compounds', useIndex=True)
-else:
-     print('Nothing changed since last database insert!')

-TodayDate= datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
+
+TodayDate = datetime.strptime(sentimentsYesterday['date'][0], '%Y-%m-%d').strftime('%d.%m.%Y')
 dataframe4PieChart = sentimentsYesterday.drop('date', axis=1).set_index('sentiment')
-dataframe4LineChart = crudManager.loadFromDatabase('Compounds', 'date').drop('index', axis=1)
+dataframe4LineChart = crudManager.loadFromDatabase('SentimentMeans', 'date').drop('index', axis=1)

-fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10,10))
+fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))

 # Pie chart.
 pieChartlabels = dataframe4PieChart.index.to_numpy()
@ -61,24 +66,22 @@ pieChart = dataframe4PieChart.plot.pie(
    ylabel="",
    labels=dataframe4PieChart['sentimentCount'],
    title=f'Moods of the toots on {TodayDate} of the local timeline on fedihum.org',
-    colors = todaysColors,
+    colors=todaysColors,
    wedgeprops=dict(linewidth=3, edgecolor='w'),
    startangle=90
 )

 axes[0].axis('equal')
-centre_circle = plt.Circle((0,0),0.6,fc='white')
+centre_circle = plt.Circle((0, 0), 0.6, fc='white')
 axes[0].add_patch(centre_circle)
 chartBox = axes[0].get_position()
-axes[0].set_position([chartBox.x0,chartBox.y0-0.2,chartBox.width,chartBox.height])
-axes[0].legend(pieChartlabels,loc='upper right', bbox_to_anchor=(0.8, 0.9))
-
+axes[0].legend(pieChartlabels, loc='upper right', bbox_to_anchor=(0.9, 0.9))

 # Line chart.
 lineChart = dataframe4LineChart.plot.line(
    ax=axes[1],
-    title='Compounds from max positive (1) to min negative (-1)'
-    )
+    title='Mean of all sentiments from max positive (1) to min negative (-1)'
+)
 axes[1].grid(True)
 axes[1].set_xlim([date(2023, 1, 1), date(2023, 12, 31)])
 axes[1].set_ylim([-1, 1])
@ -88,8 +91,9 @@ axes[1].xaxis.set_major_formatter(plt.NullFormatter())
 axes[1].xaxis.set_minor_formatter(mdates.DateFormatter('%h'))
 axes[1].tick_params(which='minor', length=0)
 plotFileUrl = f'./plots/{TodayDate}.png'
-plt.show()
 plt.savefig(plotFileUrl)

+"""
 media = mastodonInstance.media_post(plotFileUrl, mime_type="image/png", description=f"Sentiment analysis of local timeline on fedihum.org, showing the moods of the toots on, and the compounds up to {TodayDate}.")
 mastodonInstance.status_post(f'The moods of the toots on and up to {TodayDate}.', media_ids=media, language='en')
+"""
--- a/MastodonAccountManager.py
+++ b/MastodonAccountManager.py
@ -2,4 +2,4 @@ from mastodon import Mastodon

 class MastodonAccountManager():
    def __init__(self):
-        self.instance =  Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
+        self.instance = Mastodon(client_id = 'hedonodon_clientcred.secret', access_token = 'hedonodon_usercred.secret')
--- a/SentiTooter.py
+++ b/SentiTooter.py
@ -1,19 +1,74 @@
-from math import sqrt
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from germansentiment import SentimentModel
 import numpy as np
+from scipy.special import softmax
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

-class SentiTooter():
+
+# Preprocess text (username and link placeholders)
+def preprocess(text):
+    new_text = []
+
+    for t in text.split(" "):
+        t = '@user' if t.startswith('@') and len(t) > 1 else t
+        t = 'http' if t.startswith('http') else t
+        new_text.append(t)
+    return " ".join(new_text)
+
+
+class SentiTooter:
    """"""
+
    def __init__(self):
+        self.deModel = SentimentModel()
+        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
+        self.enModel, self.enTokenizer = self.initModel()
+        # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
+        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

-
    def analyze(self, toot):
-        compound = self.sia.polarity_scores(toot.content)['compound']
-        if (compound > (1/3)):
-            return ['positive', compound]
-        elif (compound < (-1/3)):
-            return ['negative', compound]
-        else:
-            return ['neutral', compound]
+        match toot.language:
+            case 'de':
+                sentiment = self.deModel.predict_sentiment([toot.content])
+                sentiment.append('germanSentiment')
+                return sentiment
+            case 'en':
+                text = preprocess(toot.content)
+                encoded_input = self.enTokenizer(text, return_tensors='pt')
+                output = self.enModel(**encoded_input)
+                scores = output[0][0].detach().numpy()
+                scores = softmax(scores)
+                sentimentIndexWithMaxScore = np.argmax(scores)
+                sentimentLabel = self.labels[sentimentIndexWithMaxScore]
+                sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment']
+                return sentiment
+            case _:
+                compound = self.sia.polarity_scores(toot.content)['compound']
+                if compound > (1 / 3):
+                    return ['positive', 'vaderSentiment']
+                elif compound < (-1 / 3):
+                    return ['negative', 'vaderSentiment']
+                else:
+                    return ['neutral', 'vaderSentiment']

+
+
+    def initModel(self):
+        # PT
+        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
+        tokenizer.save_pretrained(self.enModelType)
+        model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
+        model.save_pretrained(self.enModelType)
+        return model, tokenizer
+
+    # # TF
+    # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
+    # model.save_pretrained(MODEL)
+
+    # text = "Good night 😊"
+    # encoded_input = tokenizer(text, return_tensors='tf')
+    # output = model(encoded_input)
+    # scores = output[0][0].numpy()
+    # scores = softmax(scores)
--- a/Tables.py
+++ b/Tables.py
@ -5,7 +5,7 @@ class Toots(Base):
    __tablename__ = 'Toots'
    __table_args__ = {'extend_existing': True}
    index = Column(Integer, primary_key=True)
-    compound =  Column(Float)
+    model = Column(String(30))
    datetime = Column(Date)
    language = Column(String(3))
    sentiment = Column(String(8))
@ -16,18 +16,17 @@ class Toots(Base):



-class Sentiments(Base):
-    __tablename__ = 'Sentiments'
+class SentimentCounts(Base):
+    __tablename__ = 'SentimentCounts'
    __table_args__ = {'extend_existing': True}
    index = Column(Integer, primary_key=True)
    sentimentCount = Column(Integer)
-    date = Column(Date, primary_key =  True)
+    date = Column(Date, primary_key=True)
    sentiment = Column(String(8))

-
-class Compounds(Base):
-    __tablename__ = 'Compounds'
+class SentimentMeans(Base):
+    __tablename__ = 'SentimentMeans'
    __table_args__ = {'extend_existing': True}
    index = Column(Integer, primary_key=True)
-    date = Column(Date, primary_key =  True)
-    compoundAvg = Column(Float)
+    date = Column(Date, primary_key=True)
+    SentimentsMean = Column(Float)
--- a/TootCrawler.py
+++ b/TootCrawler.py
@ -1,10 +1,10 @@
+from langdetect import detect
 import pytz
 import pandas as pd
 import re
 from SentiTooter import SentiTooter
 from pprint import pprint

-
 class TootCrawler():

    def __init__(self, mastodonInstance) -> None:
@ -13,29 +13,34 @@ class TootCrawler():
        self.sentiTooter = SentiTooter()
        self.localTimezone = pytz.timezone('Europe/Berlin')

-    def getLocalTimeline(self,  sinceId=None):
-        return  self.mastodonInstance.timeline_local(since_id=sinceId)
+    def getLocalTimeline(self, minId=None):
+        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)

    def cleanhtml(self, raw_html):
        cleantext = re.sub(self.compilePattern, '', raw_html)
        cleantext = re.sub(r'http\S+', '', cleantext)
        return cleantext

-    def buildTootsDataframe(self, sinceId=None):
+    def buildTootsDataframe(self, minId=None):
        toots = []
+        allTimelineResults = []
+        timelinePagination = self.getLocalTimeline(minId)

-        for i in self.getLocalTimeline(sinceId):
+        while timelinePagination:
+            allTimelineResults = allTimelineResults + timelinePagination
+            timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)
+        for i in allTimelineResults:
            content = self.cleanhtml(i.content)
            sentiment = self.sentiTooter.analyze(i)
            toots.append(
                    {
                            "sentiment": sentiment[0],
-                            "compound": sentiment[1],
+                            "model": sentiment[1],
                            "userName": i.account.display_name,
                            "userId": i.account.id,
                            "toot": content,
                            "datetime": i.created_at.astimezone(self.localTimezone),
-                            "language": i.language,
+                            "language": detect(content),
                            "tootId": i.id
                    }
                )
--- a/requirements.txt
+++ b/requirements.txt
@ -3,4 +3,6 @@ matplotlib
 pandas
 sqlalchemy
 vader-multi
-numpy
+numpy
+pytz
+transformers