implement rough wordcount

This commit is contained in:
rnsrk 2023-03-15 14:27:07 +01:00
parent 09fd313a89
commit 6a8caac29e
4 changed files with 47 additions and 9 deletions

View file

@ -31,14 +31,14 @@ def calculateSentimentMean(dataframe):
]
)
def getYesterdaysToots():
    """Load the toots stored for the previous day.

    Selects the rows of the ``Toots`` table whose ``datetime`` is on or after
    ``DATE("now", "-1 day")`` and before ``DATE("now")``, both evaluated by
    the database.

    Returns:
        A DataFrame with the columns ``date``, ``language``, ``sentiment``
        and ``toot``.
    """
    # Plain string literal: the statement interpolates no values.
    query = '''SELECT datetime as date, language, sentiment, toot
        FROM Toots
        WHERE datetime >= DATE("now","-1 day")
        AND datetime < DATE("now")'''
    return pd.read_sql(
        sql.text(query),
        connection,
        # The timestamp column is aliased to "date" in the SELECT, so that is
        # the result column pandas must be asked to parse.
        parse_dates=["date"],
    )


# Backward-compatible alias: keeps the previous name of this query importable.
calculateWordCount = getYesterdaysToots

View file

@ -1,4 +1,4 @@
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, calculateWordCount
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
from datetime import datetime, date
from DbSetup import init_db
import locale
@ -6,6 +6,7 @@ from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from TootCrawler import TootCrawler
from SentiTooter import translateToots, countWords
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
init_db()
@ -31,7 +32,10 @@ if not tootsDataframe.empty:
else:
print('Nothing changed since last database insert!')
# Word statistics for yesterday: load the toots, translate them to German,
# join them into one text and print the ten most frequent nouns.
yesterdaysToots = getYesterdaysToots()
translatedToots = translateToots(yesterdaysToots)
tootsSeries = translatedToots.toot
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 10)
print(wordCounts)
print("exit programm")
exit()

View file

@ -4,15 +4,17 @@ from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import spacy
from collections import Counter
# Preprocess text (blank out user mentions and links)
def preprocess(text):
    """Blank out user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with ``@``
    (and is longer than one character) or starts with ``http`` is replaced by
    an empty string. The tokens are then re-joined with single spaces, so a
    removed token leaves its surrounding spaces behind.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned text.
    """
    cleaned = []
    for token in text.split(" "):
        is_mention = token.startswith('@') and len(token) > 1
        if is_mention or token.startswith('http'):
            token = ''
        cleaned.append(token)
    return " ".join(cleaned)
@ -65,3 +67,33 @@ class SentiTooter:
model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
model.save_pretrained(self.enModelType)
return model, tokenizer
def translateToots(yesterdaysToots):
    """Translate every non-German toot of a frame to German.

    Args:
        yesterdaysToots: DataFrame with at least the columns ``language``
            and ``toot``.

    Returns:
        A new DataFrame in which each row whose ``language`` was not ``'de'``
        carries the translated ``toot`` and the language ``'de'``. Rows whose
        translation raised an error are removed. The frame passed in is left
        unmodified.
    """
    # Work on a copy so the caller's frame is not changed behind its back.
    translated = yesterdaysToots.copy()
    failed = []
    for index, row in translated.iterrows():
        if row['language'] == 'de':
            continue
        try:
            germanText = translateToot(row['language'], row['toot'])
        except Exception:
            # Best effort: a toot that cannot be translated is left out of the
            # result instead of aborting the whole run.
            failed.append(index)
            continue
        translated.at[index, 'toot'] = germanText
        translated.at[index, 'language'] = 'de'
    # DataFrame.drop returns a new frame, so its result has to be used; the
    # rows are dropped after the loop to avoid reshaping the frame mid-iteration.
    return translated.drop(index=failed)
def translateToot(language, toot):
    """Translate a single toot to German.

    The toot is first passed through ``preprocess`` and the result is handed
    to ``GoogleTranslator`` with ``language`` as source and ``'de'`` as
    target.

    Args:
        language: source language, passed to ``GoogleTranslator`` as ``source``.
        toot: the toot text to translate.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for the cleaned text.
    """
    cleaned = preprocess(toot)
    translator = GoogleTranslator(source=language, target='de')
    return translator.translate(cleaned)
def countWords(concatedToots, count):
    """Return the ``count`` most frequent nouns of a text.

    The text is analysed with the spaCy pipeline ``de_core_news_sm``. Only
    tokens tagged ``NOUN`` that are neither stop words nor punctuation are
    counted.

    Args:
        concatedToots: the text to analyse (the caller passes all toots
            joined into one string).
        count: how many entries to return.

    Returns:
        A list of ``(noun, occurrences)`` tuples as produced by
        ``Counter.most_common(count)``.
    """
    pipeline = spacy.load('de_core_news_sm')
    frequencies = Counter()
    for token in pipeline(concatedToots):
        # Skip stop words and punctuation.
        if token.is_stop or token.is_punct:
            continue
        # Only nouns are counted.
        if token.pos_ != "NOUN":
            continue
        frequencies[token.text] += 1
    return frequencies.most_common(count)

View file

@ -10,3 +10,5 @@ transformers
wheel
germansentiment
scipy
deep_translator
spacy