add code documentation

This commit is contained in:
rnsrk 2023-03-17 20:06:01 +01:00
parent 4479bd2429
commit bc842244c7
7 changed files with 261 additions and 31 deletions

View file

@ -1,4 +1,5 @@
from germansentiment import SentimentModel
from pandas import DataFrame
import numpy as np
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
@ -9,7 +10,18 @@ import spacy
from collections import Counter
# Preprocess text (username and link placeholders)
def preprocess(text):
def preprocess(text:str) -> str:
"""Removes tags and urls from text.
Parameters
------
text: str
The raw toot from Mastodon.
Returns
------
str
The cleaned text.
"""
new_text = []
for t in text.split(" "):
@ -20,9 +32,12 @@ def preprocess(text):
class SentiTooter:
""""""
"""Class to analyze the toots.
"""
def __init__(self):
"""Initialize the sentiment models and labels.
"""
self.deModel = SentimentModel()
self.enModelType = f"cardiffnlp/twitter-roberta-base-sentiment"
self.enModel, self.enTokenizer = self.initModel()
@ -30,7 +45,20 @@ class SentiTooter:
self.labels = ['negative', 'neutral', 'positive']
self.sia = SentimentIntensityAnalyzer()
def analyze(self, language, content):
def analyze(self, language:str, content:str) -> list[str, str, float]:
"""Analyzes the sentiments of the toots.
Parameters
------
language: str
The language tag of the toot.
content: str
The toot content.
Returns
------
list[str, str, float]
A list with the sentiment, analyzer type, and sentiment score.
"""
match language:
case 'de':
sentimentList, probabilitiesList = self.deModel.predict_sentiment([content], output_probabilities=True)
@ -61,6 +89,13 @@ class SentiTooter:
def initModel(self):
"""Initialize the English model and tokenizer.
Returns
------
tuple
The pretrained model and tokenizer.
"""
# PT
tokenizer = AutoTokenizer.from_pretrained(self.enModelType)
tokenizer.save_pretrained(self.enModelType)
@ -68,7 +103,14 @@ class SentiTooter:
model.save_pretrained(self.enModelType)
return model, tokenizer
def translateToots(yesterdaysToots):
def translateToots(yesterdaysToots:DataFrame) -> DataFrame:
"""Translates all toots to English.
Returns
------
DataFrame
Containing the toots translated into English.
"""
yesterdaysTootsTranslated = yesterdaysToots
for index, row in yesterdaysTootsTranslated.iterrows():
if (row['language'] != 'en'):
@ -79,11 +121,39 @@ def translateToots(yesterdaysToots):
yesterdaysTootsTranslated.drop(index)
return yesterdaysTootsTranslated
def translateToot(language, toot):
def translateToot(language:str, toot:str) -> str:
"""Translate a toot into English.
Parameters
------
language:str
The language of the toot.
toot: str
The toot content.
Returns
------
str
The toot translated into English.
"""
content = preprocess(toot)
return GoogleTranslator(source=language, target='en').translate(content)
def countWords(concatedToots, count):
def countWords(concatedToots: str, number: int) -> list:
"""Counts the word frequencies in all toots of a given sentiment.
Parameters
------
concatedToots: str
All toots from a sentiment.
number: int
Number of most frequent words to return.
Returns
------
list
List of (word, frequency) tuples.
"""
nlp = spacy.load('en_core_web_md')
doc = nlp(concatedToots)
@ -96,18 +166,30 @@ def countWords(concatedToots, count):
# the N most common noun tokens (N is passed in by the caller)
noun_freq = Counter(nouns)
return noun_freq.most_common(count)
return noun_freq.most_common(number)
def createWordCountPerSentiment(translatedToots):
def createWordFrequenciesPerSentiment(translatedToots:DataFrame) -> str:
"""Count all word frequencies of all toots per sentiment.
Parameters
------
translatedToots: DataFrame
The dataframe with all toots in english.
Returns
------
str
Containing words and word counts per sentiment.
"""
sentimentList = []
for sentiment in ['positive', 'neutral', 'negative']:
tootsSeries = translatedToots[translatedToots['sentiment'] == sentiment].toot
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 5)
countList = []
for count in wordCounts:
countList.append(str(count[0]) + ' (' + str(count[1]) + ')')
list2String = ', '.join(countList)
wordFrequencies = countWords(tootsSeries.str.cat(sep=' '), 5)
FrequenciesList = []
for Frequencies in wordFrequencies:
FrequenciesList.append(str(Frequencies[0]) + ' (' + str(Frequencies[1]) + ')')
list2String = ', '.join(FrequenciesList)
sentimentString = sentiment + ': ' + list2String
sentimentList.append(sentimentString)
wordCountsPerSentiments = '\n'.join(sentimentList)
return wordCountsPerSentiments
wordFrequenciessPerSentiments = '\n'.join(sentimentList)
return wordFrequenciessPerSentiments