implement rough wordcount

This commit is contained in:
rnsrk 2023-03-15 14:27:07 +01:00
parent 09fd313a89
commit 6a8caac29e
4 changed files with 47 additions and 9 deletions

View file

@ -31,14 +31,14 @@ def calculateSentimentMean(dataframe):
] ]
) )
def calculateWordCount(): def getYesterdaysToots():
query = f'''SELECT DATE(datetime) as date, language, sentiment, toot query = f'''SELECT datetime as date, language, sentiment, toot
FROM Toots FROM Toots
WHERE datetime >= DATE("now","-1 day") WHERE datetime >= DATE("now","-1 day")
AND datetime < DATE("now")''' AND datetime < DATE("now")'''
return pd.read_sql( return pd.read_sql(
query, sql.text(query),
engine, connection,
parse_dates=["datetime"] parse_dates=["datetime"]
) )

View file

@ -1,4 +1,4 @@
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, calculateWordCount from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
from datetime import datetime, date from datetime import datetime, date
from DbSetup import init_db from DbSetup import init_db
import locale import locale
@ -6,6 +6,7 @@ from MastodonAccountManager import MastodonAccountManager
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.dates as mdates import matplotlib.dates as mdates
from TootCrawler import TootCrawler from TootCrawler import TootCrawler
from SentiTooter import translateToots, countWords
locale.setlocale(locale.LC_TIME, "en_US.UTF-8") locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
init_db() init_db()
@ -31,7 +32,10 @@ if not tootsDataframe.empty:
else: else:
print('Nothing changed since last database insert!') print('Nothing changed since last database insert!')
wordCounts = calculateWordCount() yesterdaysToots = getYesterdaysToots()
translatedToots = translateToots(yesterdaysToots)
tootsSeries = translatedToots.toot
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 10)
print(wordCounts); print(wordCounts);
print("exit programm") print("exit programm")
exit() exit()

View file

@ -4,15 +4,17 @@ from scipy.special import softmax
from transformers import AutoModelForSequenceClassification from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from deep_translator import GoogleTranslator
import spacy
from collections import Counter
# Preprocess text (username and link placeholders) # Preprocess text (username and link placeholders)
def preprocess(text): def preprocess(text):
new_text = [] new_text = []
for t in text.split(" "): for t in text.split(" "):
t = '@user' if t.startswith('@') and len(t) > 1 else t t = '' if t.startswith('@') and len(t) > 1 else t
t = 'http' if t.startswith('http') else t t = '' if t.startswith('http') else t
new_text.append(t) new_text.append(t)
return " ".join(new_text) return " ".join(new_text)
@ -65,3 +67,33 @@ class SentiTooter:
model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
model.save_pretrained(self.enModelType) model.save_pretrained(self.enModelType)
return model, tokenizer return model, tokenizer
def translateToots(yesterdaysToots):
    """Return a copy of the toots with every non-German toot translated to German.

    Args:
        yesterdaysToots: pandas DataFrame with at least the columns
            ``language`` and ``toot``.

    Returns:
        A new DataFrame in which each row whose ``language`` was not ``'de'``
        has its ``toot`` replaced by the translation and its ``language`` set
        to ``'de'``. Rows whose translation raised an error are left out.
        The input frame is not modified.
    """
    import logging

    # Work on a real copy: plain assignment would only alias the caller's
    # frame, and writing into a frame while iterating over it is unsafe.
    translated = yesterdaysToots.copy()
    failed = []
    for index, row in yesterdaysToots.iterrows():
        if row['language'] == 'de':
            continue
        try:
            text = translateToot(row['language'], row['toot'])
        except Exception:
            # Best effort: one toot that cannot be translated must not abort
            # the whole batch, so remember it and remove it afterwards.
            logging.getLogger(__name__).warning(
                "could not translate toot at index %r; dropping it",
                index,
                exc_info=True,
            )
            failed.append(index)
            continue
        translated.at[index, 'toot'] = text
        translated.at[index, 'language'] = 'de'
    # DataFrame.drop returns a new frame, so its result has to be used.
    return translated.drop(index=failed)
def translateToot(language, toot):
    """Translate a single toot into German.

    Args:
        language: source language code handed to ``GoogleTranslator``.
        toot: raw toot text.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for the toot after it
        has been run through ``preprocess``.
    """
    # Clean the text first (preprocess handles @mentions and links).
    cleaned = preprocess(toot)
    translator = GoogleTranslator(source=language, target='de')
    return translator.translate(cleaned)
def countWords(concatedToots, count):
    """Return the ``count`` most frequent nouns in the given text.

    Args:
        concatedToots: one string holding all toot texts to analyse.
        count: how many of the most common nouns to return.

    Returns:
        A list of ``(noun, frequency)`` tuples as produced by
        ``Counter.most_common(count)``.
    """
    nlp = spacy.load('de_core_news_sm')
    noun_freq = Counter()
    for token in nlp(concatedToots):
        # Skip stop words, punctuation and anything not tagged as a noun.
        if token.is_stop or token.is_punct or token.pos_ != "NOUN":
            continue
        noun_freq[token.text] += 1
    return noun_freq.most_common(count)

View file

@ -10,3 +10,5 @@ transformers
wheel wheel
germansentiment germansentiment
scipy scipy
deep_translator
spacy