implement rough wordcount
This commit is contained in:
parent
09fd313a89
commit
6a8caac29e
4 changed files with 47 additions and 9 deletions
|
|
@ -31,14 +31,14 @@ def calculateSentimentMean(dataframe):
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
def getYesterdaysToots():
    """Load the toots stored for yesterday.

    Selects the rows of the ``Toots`` table whose ``datetime`` is on or
    after ``DATE("now","-1 day")`` and before ``DATE("now")``.

    Returns:
        The DataFrame produced by ``pd.read_sql`` with the columns
        ``date``, ``language``, ``sentiment`` and ``toot``.
    """
    # Plain string literal: the query interpolates nothing, so it does not
    # need to be an f-string.
    query = '''SELECT datetime as date, language, sentiment, toot
               FROM Toots
               WHERE datetime >= DATE("now","-1 day")
               AND datetime < DATE("now")'''
    return pd.read_sql(
        sql.text(query),
        connection,
        # The query aliases ``datetime`` to ``date``; parse_dates has to name
        # the column as it appears in the result set, otherwise no column
        # matches and nothing gets parsed.
        parse_dates=["date"]
    )
|
||||||
|
|
||||||
|
|
|
||||||
8
Main.py
8
Main.py
|
|
@ -1,4 +1,4 @@
|
||||||
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, calculateWordCount
|
from CRUDManager import CRUDManager, calculateSentimentCount, calculateSentimentMean, getYesterdaysToots
|
||||||
from datetime import datetime, date
|
from datetime import datetime, date
|
||||||
from DbSetup import init_db
|
from DbSetup import init_db
|
||||||
import locale
|
import locale
|
||||||
|
|
@ -6,6 +6,7 @@ from MastodonAccountManager import MastodonAccountManager
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import matplotlib.dates as mdates
|
import matplotlib.dates as mdates
|
||||||
from TootCrawler import TootCrawler
|
from TootCrawler import TootCrawler
|
||||||
|
from SentiTooter import translateToots, countWords
|
||||||
|
|
||||||
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
|
locale.setlocale(locale.LC_TIME, "en_US.UTF-8")
|
||||||
init_db()
|
init_db()
|
||||||
|
|
@ -31,7 +32,10 @@ if not tootsDataframe.empty:
|
||||||
else:
|
else:
|
||||||
print('Nothing changed since last database insert!')
|
print('Nothing changed since last database insert!')
|
||||||
|
|
||||||
wordCounts = calculateWordCount()
|
yesterdaysToots = getYesterdaysToots()
|
||||||
|
translatedToots = translateToots(yesterdaysToots)
|
||||||
|
tootsSeries = translatedToots.toot
|
||||||
|
wordCounts = countWords(tootsSeries.str.cat(sep=' '), 10)
|
||||||
print(wordCounts);
|
print(wordCounts);
|
||||||
print("exit programm")
|
print("exit programm")
|
||||||
exit()
|
exit()
|
||||||
|
|
|
||||||
|
|
@ -4,15 +4,17 @@ from scipy.special import softmax
|
||||||
from transformers import AutoModelForSequenceClassification
|
from transformers import AutoModelForSequenceClassification
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||||
|
from deep_translator import GoogleTranslator
|
||||||
|
import spacy
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
# Preprocess text (username and link placeholders)
|
# Preprocess text (username and link placeholders)
|
||||||
def preprocess(text):
    """Blank out user mentions and links in *text*.

    The text is split on single spaces. Every token that starts with ``@``
    and is longer than one character, and every token that starts with
    ``http``, is replaced by an empty string. The tokens are then joined
    again with single spaces, so a blanked token leaves its surrounding
    spaces behind.
    """
    def scrub(token):
        # A lone "@" is kept; only "@something" counts as a mention.
        is_mention = token.startswith('@') and len(token) > 1
        is_link = token.startswith('http')
        return '' if is_mention or is_link else token

    return " ".join(scrub(token) for token in text.split(" "))
|
||||||
|
|
||||||
|
|
@ -65,3 +67,33 @@ class SentiTooter:
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
|
model = AutoModelForSequenceClassification.from_pretrained(self.enModelType)
|
||||||
model.save_pretrained(self.enModelType)
|
model.save_pretrained(self.enModelType)
|
||||||
return model, tokenizer
|
return model, tokenizer
|
||||||
|
|
||||||
|
def translateToots(yesterdaysToots):
    """Translate every non-German toot of a dataframe to German.

    Rows whose ``language`` is already ``'de'`` are left untouched. For every
    other row the ``toot`` text is replaced by the result of
    ``translateToot`` and ``language`` is set to ``'de'``. Rows whose
    translation raises are removed from the result (best effort).

    Args:
        yesterdaysToots: DataFrame with at least the columns ``language``
            and ``toot``.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is not changed behind its back.
    translated = yesterdaysToots.copy()
    failed = []
    for index, row in yesterdaysToots.iterrows():
        if row['language'] == 'de':
            continue
        try:
            text = translateToot(row['language'], row['toot'])
        except Exception:
            # Deliberately broad: any translation failure just means the toot
            # is left out. Unlike a bare ``except:`` this still lets
            # KeyboardInterrupt / SystemExit through.
            failed.append(index)
            continue
        translated.at[index, 'toot'] = text
        translated.at[index, 'language'] = 'de'
    # DataFrame.drop returns a new frame, so its result has to be used;
    # dropping after the loop also avoids changing rows while iterating.
    return translated.drop(index=failed)
|
||||||
|
|
||||||
|
def translateToot(language, toot):
    """Translate a single toot from *language* into German.

    The toot is first passed through ``preprocess``; the result is handed to
    a ``GoogleTranslator`` created with ``source=language`` and
    ``target='de'``.

    Returns:
        Whatever ``GoogleTranslator.translate`` returns for the cleaned text.
    """
    cleaned = preprocess(toot)
    translator = GoogleTranslator(source=language, target='de')
    return translator.translate(cleaned)
|
||||||
|
|
||||||
|
def countWords(concatedToots, count):
    """Return the *count* most frequent nouns in *concatedToots*.

    The text is processed with the spaCy pipeline ``de_core_news_sm``. Only
    tokens tagged ``NOUN`` that are neither stop words nor punctuation are
    counted, using their surface text.

    Returns:
        The list produced by ``Counter.most_common(count)``: pairs of
        ``(noun, occurrences)``.
    """
    pipeline = spacy.load('de_core_news_sm')

    tally = Counter()
    for token in pipeline(concatedToots):
        # skip stop words and punctuation before looking at the POS tag
        if token.is_stop or token.is_punct:
            continue
        if token.pos_ == "NOUN":
            tally[token.text] += 1

    return tally.most_common(count)
|
||||||
|
|
@ -10,3 +10,5 @@ transformers
|
||||||
wheel
|
wheel
|
||||||
germansentiment
|
germansentiment
|
||||||
scipy
|
scipy
|
||||||
|
deep_translator
|
||||||
|
spacy
|
||||||
Loading…
Add table
Add a link
Reference in a new issue