Now using a language-dependent sentiment analyzer. No compound score anymore.

2023-01-05 01:43:11 +01:00 · 2023-01-05 01:43:11 +01:00 · a20f7331bb
commit a20f7331bb
parent f0d4eadf28
8 changed files with 153 additions and 72 deletions
--- a/SentiTooter.py
+++ b/SentiTooter.py
@@ -1,19 +1,74 @@
-from math import sqrt
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from germansentiment import SentimentModel
 import numpy as np
+from scipy.special import softmax
+from transformers import AutoModelForSequenceClassification
+from transformers import AutoTokenizer
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

-class SentiTooter():
+
+# Preprocess text (username and link placeholders)
+def preprocess(text):
+    new_text = []
+
+    for t in text.split(" "):
+        t = '@user' if t.startswith('@') and len(t) > 1 else t
+        t = 'http' if t.startswith('http') else t
+        new_text.append(t)
+    return " ".join(new_text)
+
+
+class SentiTooter:
    """"""
+
    def __init__(self):
+        self.deModel = SentimentModel()
+        self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
+        self.enModel, self.enTokenizer = self.initModel()
+        # https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
+        self.labels = ['negative', 'neutral', 'positive']
        self.sia = SentimentIntensityAnalyzer()

-
    def analyze(self, toot):
-        compound = self.sia.polarity_scores(toot.content)['compound']
-        if (compound > (1/3)):
-            return ['positive', compound]
-        elif (compound < (-1/3)):
-            return ['negative', compound]
-        else:
-            return ['neutral', compound]
+        match toot.language:
+            case 'de':
+                sentiment = self.deModel.predict_sentiment([toot.content])
+                sentiment.append('germanSentiment')
+                return sentiment
+            case 'en':
+                text = preprocess(toot.content)
+                encoded_input = self.enTokenizer(text, return_tensors='pt')
+                output = self.enModel(**encoded_input)
+                scores = output[0][0].detach().numpy()
+                scores = softmax(scores)
+                sentimentIndexWithMaxScore = np.argmax(scores)
+                sentimentLabel = self.labels[sentimentIndexWithMaxScore]
+                sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment']
+                return sentiment
+            case _:
+                compound = self.sia.polarity_scores(toot.content)['compound']
+                if compound > (1 / 3):
+                    return ['positive', 'vaderSentiment']
+                elif compound < (-1 / 3):
+                    return ['negative', 'vaderSentiment']
+                else:
+                    return ['neutral', 'vaderSentiment']

+
+
+    def initModel(self):
+        # PT (PyTorch): load the tokenizer and model, then save local copies
+        tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
+        tokenizer.save_pretrained(self.enModelType)
+        model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
+        model.save_pretrained(self.enModelType)
+        return model, tokenizer
+
+    # # TF
+    # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
+    # model.save_pretrained(MODEL)
+
+    # text = "Good night 😊"
+    # encoded_input = tokenizer(text, return_tensors='tf')
+    # output = model(encoded_input)
+    # scores = output[0][0].numpy()
+    # scores = softmax(scores)