From 6a8caac29efed1bde02575f6804ec650d3fe02f4 Mon Sep 17 00:00:00 2001 From: rnsrk Date: Wed, 15 Mar 2023 14:27:07 +0100 Subject: [PATCH] implement rough wordcount --- CRUDManager.py | 8 ++++---- Main.py | 8 ++++++-- SentiTooter.py | 38 +++++++++++++++++++++++++++++++++++--- requirements.txt | 2 ++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/CRUDManager.py b/CRUDManager.py index ea7e7e5..dccdf00 100644 --- a/CRUDManager.py +++ b/CRUDManager.py @@ -31,14 +31,14 @@ def calculateSentimentMean(dataframe): ] ) -def calculateWordCount(): - query = f'''SELECT DATE(datetime) as date, language, sentiment, toot +def getYesterdaysToots(): + query = f'''SELECT datetime as date, language, sentiment, toot FROM Toots WHERE datetime >= DATE("now","-1 day") AND datetime < DATE("now")''' return pd.read_sql( - query, - engine, + sql.text(query), + connection, parse_dates=["datetime"] ) diff --git a/Main.py b/Main.py index 56ba6b7..2af6e60 100644 --- a/Main.py +++ b/Main.py @@ -1,4 +1,4 @@ -from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, calculateWordCount +from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots from datetime import datetime, date from DbSetup import init_db import locale @@ -6,6 +6,7 @@ from MastodonAccountManager import MastodonAccountManager import matplotlib.pyplot as plt import matplotlib.dates as mdates from TootCrawler import TootCrawler +from SentiTooter import translateToots, countWords locale.setlocale(locale.LC_TIME, "en_US.UTF-8") init_db() @@ -31,7 +32,10 @@ if not tootsDataframe.empty: else: print('Nothing changed since last database insert!') -wordCounts = calculateWordCount() +yesterdaysToots = getYesterdaysToots() +translatedToots = translateToots(yesterdaysToots) +tootsSeries = translatedToots.toot +wordCounts = countWords(tootsSeries.str.cat(sep=' '), 10) print(wordCounts); print("exit programm") exit() diff --git a/SentiTooter.py b/SentiTooter.py index 6aa1f92..d5f22ef 100644 --- a/SentiTooter.py +++ b/SentiTooter.py @@ -4,15 +4,17 @@ from scipy.special import softmax from transformers import AutoModelForSequenceClassification from transformers import AutoTokenizer from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer - +from deep_translator import GoogleTranslator +import spacy +from collections import Counter # Preprocess text (username and link placeholders) def preprocess(text): new_text = [] for t in text.split(" "): - t = '@user' if t.startswith('@') and len(t) > 1 else t - t = 'http' if t.startswith('http') else t + t = '' if t.startswith('@') and len(t) > 1 else t + t = '' if t.startswith('http') else t new_text.append(t) return " ".join(new_text) @@ -65,3 +67,33 @@ class SentiTooter: model = AutoModelForSequenceClassification.from_pretrained(self.enModelType) model.save_pretrained(self.enModelType) return model, tokenizer + +def translateToots(yesterdaysToots): + yesterdaysTootsTranslated = yesterdaysToots + for index, row in yesterdaysTootsTranslated.iterrows(): + if (row['language'] != 'de'): + try: + yesterdaysTootsTranslated.at[index,'toot'] = translateToot(row['language'], row['toot']) + yesterdaysTootsTranslated.at[index,'language'] = 'de' + except: + yesterdaysTootsTranslated.drop(index) + return yesterdaysTootsTranslated + +def translateToot(language, toot): + content = preprocess(toot) + return GoogleTranslator(source=language, target='de').translate(content) + +def countWords(concatedToots, count): + nlp = spacy.load('de_core_news_sm') + doc = nlp(concatedToots) + + # noun tokens that arent stop words or punctuations + nouns = [token.text + for token in doc + if (not token.is_stop and + not token.is_punct and + token.pos_ == "NOUN")] + + # five most common noun tokens + noun_freq = Counter(nouns) + return noun_freq.most_common(count) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2cf3aab..bc6906e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ transformers wheel germansentiment scipy +deep_translator +spacy \ No newline at end of file