add code documentation
This commit is contained in:
parent
4479bd2429
commit
bc842244c7
7 changed files with 261 additions and 31 deletions
112
SentiTooter.py
112
SentiTooter.py
|
|
@ -1,4 +1,5 @@
|
|||
from germansentiment import SentimentModel
|
||||
from pandas import DataFrame
|
||||
import numpy as np
|
||||
from scipy.special import softmax
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
|
|
@ -9,7 +10,18 @@ import spacy
|
|||
from collections import Counter
|
||||
|
||||
# Preprocess text (username and link placeholders)
|
||||
def preprocess(text):
|
||||
def preprocess(text:str) -> str:
|
||||
"""Removes tags and urls from text.
|
||||
|
||||
Parameters
|
||||
------
|
||||
text: str
|
||||
The raw toot from Mastodon.
|
||||
Returns
|
||||
------
|
||||
str
|
||||
The cleaned text.
|
||||
"""
|
||||
new_text = []
|
||||
|
||||
for t in text.split(" "):
|
||||
|
|
@ -20,9 +32,12 @@ def preprocess(text):
|
|||
|
||||
|
||||
class SentiTooter:
|
||||
""""""
|
||||
"""Class to analyze the toots.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize the sentiment models and labels.
|
||||
"""
|
||||
self.deModel = SentimentModel()
|
||||
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
|
||||
self.enModel, self.enTokenizer = self.initModel()
|
||||
|
|
@ -30,7 +45,20 @@ class SentiTooter:
|
|||
self.labels = ['negative', 'neutral', 'positive']
|
||||
self.sia = SentimentIntensityAnalyzer()
|
||||
|
||||
def analyze(self, language, content):
|
||||
def analyze(self, language:str, content:str) -> list[str, str, float]:
|
||||
"""Analyzes the sentiments of the toots.
|
||||
|
||||
Parameters
|
||||
------
|
||||
language: str
|
||||
The language tag of the toot.
|
||||
content: str
|
||||
The toot content.
|
||||
Returns
|
||||
------
|
||||
list[str, str, float]
|
||||
A list with the sentiment, analyzer type, and sentiment score.
|
||||
"""
|
||||
match language:
|
||||
case 'de':
|
||||
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
|
||||
|
|
@ -61,6 +89,13 @@ class SentiTooter:
|
|||
|
||||
|
||||
def initModel(self):
|
||||
"""Initialize the english models.
|
||||
|
||||
Returns
|
||||
------
|
||||
tuple
|
||||
The pretrained model and tokenizer.
|
||||
"""
|
||||
# PT
|
||||
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
|
||||
tokenizer.save_pretrained(self.enModelType)
|
||||
|
|
@ -68,7 +103,14 @@ class SentiTooter:
|
|||
model.save_pretrained(self.enModelType)
|
||||
return model, tokenizer
|
||||
|
||||
def translateToots(yesterdaysToots):
|
||||
def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
|
||||
"""Translates all toots to english.
|
||||
|
||||
Returns
|
||||
------
|
||||
DataFrame
|
||||
Containing the english translated toots.
|
||||
"""
|
||||
yesterdaysTootsTranslated = yesterdaysToots
|
||||
for index, row in yesterdaysTootsTranslated.iterrows():
|
||||
if (row['language'] != 'en'):
|
||||
|
|
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
|
|||
yesterdaysTootsTranslated.drop(index)
|
||||
return yesterdaysTootsTranslated
|
||||
|
||||
def translateToot(language, toot):
|
||||
def translateToot(language:str, toot:str) -> str:
|
||||
"""Translate a toot into English.
|
||||
|
||||
Parameters
|
||||
------
|
||||
language:str
|
||||
The language of the toot.
|
||||
toot: str
|
||||
The toot content.
|
||||
|
||||
Returns
|
||||
------
|
||||
str
|
||||
The toot translated into English.
|
||||
"""
|
||||
content = preprocess(toot)
|
||||
return GoogleTranslator(source=language, target='en').translate(content)
|
||||
|
||||
def countWords(concatedToots, count):
|
||||
def countWords(concatedToots: str, number: int) -> list:
|
||||
"""Counts the word frequencies in all toots of a given sentiment.
|
||||
|
||||
Parameters
|
||||
------
|
||||
concatedToots: str
|
||||
All toots from a sentiment.
|
||||
number: int
|
||||
Number of most frequent words to return.
|
||||
|
||||
Returns
|
||||
------
|
||||
list
|
||||
List containing tuple of word and word frequency.
|
||||
"""
|
||||
nlp = spacy.load('en_core_web_md')
|
||||
doc = nlp(concatedToots)
|
||||
|
||||
|
|
@ -96,18 +166,30 @@ def countWords(concatedToots, count):
|
|||
|
||||
# five most common noun tokens
|
||||
noun_freq = Counter(nouns)
|
||||
return noun_freq.most_common(count)
|
||||
return noun_freq.most_common(number)
|
||||
|
||||
def createWordCountPerSentiment(translatedToots):
|
||||
def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
|
||||
"""Count all word frequencies of all toots per sentiment.
|
||||
|
||||
Parameters
|
||||
------
|
||||
translatedToots: DataFrame
|
||||
The dataframe with all toots in english.
|
||||
|
||||
Returns
|
||||
------
|
||||
str
|
||||
Containing words and word counts per sentiment.
|
||||
"""
|
||||
sentimentList = []
|
||||
for sentiment in ['positive', 'neutral', 'negative']:
|
||||
tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
|
||||
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5)
|
||||
countList = []
|
||||
for count in wordCounts:
|
||||
countList.append(str(count[0]) + ' (' + str(count[1]) + ')')
|
||||
list2String = ', '.join(countList)
|
||||
wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
|
||||
FrequenciesList = []
|
||||
for Frequencies in wordFrequencies:
|
||||
FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
|
||||
list2String = ', '.join(FrequenciesList)
|
||||
sentimentString = sentiment + ': ' + list2String
|
||||
sentimentList.append(sentimentString)
|
||||
wordCountsPerSentiments = '\n'.join(sentimentList)
|
||||
return wordCountsPerSentiments
|
||||
wordFrequenciessPerSentiments = '\n'.join(sentimentList)
|
||||
return wordFrequenciessPerSentiments
|
||||
Loading…
Add table
Add a link
Reference in a new issue