Now using language-dependent sentiment analyzers. The compound score is no longer returned.

This commit is contained in:
rnsrk 2023-01-05 01:43:11 +01:00
parent f0d4eadf28
commit a20f7331bb
8 changed files with 153 additions and 72 deletions

View file

@ -1,19 +1,74 @@
from math import sqrt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from germansentiment import SentimentModel
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
class SentiTooter():
def preprocess(text):
    """Replace user mentions and links in *text* with fixed placeholders.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character becomes ``@user``; every token that
    starts with ``http`` becomes ``http``. All other tokens are kept as-is
    and the tokens are re-joined with single spaces, so the original spacing
    is preserved.

    Args:
        text: The raw text to normalize.

    Returns:
        The text with mentions and links replaced by placeholders.
    """

    def placeholder(token):
        # A lone "@" is not a mention, hence the length check.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(placeholder(token) for token in text.split(" "))
class SentiTooter:
""""""
def __init__(self):
self.deModel = SentimentModel()
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
self.enModel, self.enTokenizer = self.initModel()
# https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt
self.labels = ['negative', 'neutral', 'positive']
self.sia = SentimentIntensityAnalyzer()
def analyze(self, toot):
compound = self.sia.polarity_scores(toot.content)['compound']
if (compound > (1/3)):
return ['positive', compound]
elif (compound < (-1/3)):
return ['negative', compound]
else:
return ['neutral', compound]
match toot.language:
case 'de':
sentiment = self.deModel.predict_sentiment([toot.content])
sentiment.append('germanSentiment')
return sentiment
case 'en':
text = preprocess(toot.content)
encoded_input = self.enTokenizer(text, return_tensors='pt')
output = self.enModel(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
sentimentIndexWithMaxScore = np.argmax(scores)
sentimentLabel = self.labels[sentimentIndexWithMaxScore]
sentiment = [sentimentLabel, 'twitter-roberta-base-sentiment']
return sentiment
case _:
compound = self.sia.polarity_scores(toot.content)['compound']
if compound > (1 / 3):
return ['positive', 'vaderSentiment']
elif compound < (-1 / 3):
return ['negative', 'vaderSentiment']
else:
return ['neutral', 'vaderSentiment']
def initModel(self):
# PT
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
tokenizer.save_pretrained(self.enModelType)
model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
model.save_pretrained(self.enModelType)
return model, tokenizer
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)