43 lines
No EOL
1.5 KiB
Python
43 lines
No EOL
1.5 KiB
Python
import pytz
|
|
import pandas as pd
|
|
import re
|
|
from SentiTooter import SentiTooter
|
|
from pprint import pprint
|
|
|
|
|
|
class TootCrawler():
    """Collect statuses from a local timeline into a pandas DataFrame.

    Each status is stripped of HTML tags and URLs, scored by a
    ``SentiTooter`` instance and has its creation time converted to the
    ``Europe/Berlin`` timezone.
    """

    # Matches ``http``/``https`` URLs up to the next whitespace character.
    # Compiled once at class level instead of on every ``cleanhtml`` call.
    _URL_PATTERN = re.compile(r'http\S+')

    # Column order of the DataFrame built by ``buildTootsDataframe``.
    # Passing it explicitly keeps the schema stable when no toots are found.
    _COLUMNS = (
        "sentiment",
        "compound",
        "userName",
        "userId",
        "toot",
        "datetime",
        "language",
        "tootId",
    )

    def __init__(self, mastodonInstance) -> None:
        """Store the client and set up the helpers used for every toot.

        Args:
            mastodonInstance: Object exposing ``timeline_local(since_id=...)``
                (presumably a Mastodon.py client -- confirm with the caller).
        """
        self.mastodonInstance = mastodonInstance
        # Non-greedy match of anything between '<' and '>', i.e. HTML tags.
        self.compilePattern = re.compile('<.*?>')
        self.sentiTooter = SentiTooter()
        self.localTimezone = pytz.timezone('Europe/Berlin')

    def getLocalTimeline(self, sinceId=None):
        """Return the local timeline, optionally limited by ``sinceId``.

        Args:
            sinceId: Forwarded as ``since_id`` to ``timeline_local``;
                ``None`` applies no lower bound.
        """
        return self.mastodonInstance.timeline_local(since_id=sinceId)

    def cleanhtml(self, raw_html: str) -> str:
        """Return ``raw_html`` with HTML tags and URLs removed.

        Tags are deleted first, then every ``http...`` run of non-whitespace
        characters. Surrounding whitespace is left untouched.
        """
        without_tags = self.compilePattern.sub('', raw_html)
        return self._URL_PATTERN.sub('', without_tags)

    def buildTootsDataframe(self, sinceId=None) -> "pd.DataFrame":
        """Build a DataFrame of the local timeline, oldest toot first.

        Args:
            sinceId: Forwarded to ``getLocalTimeline``.

        Returns:
            A DataFrame with the columns listed in ``_COLUMNS``. When the
            timeline is empty the frame has no rows but still carries these
            columns, so column access by callers does not raise ``KeyError``.
        """
        toots = []
        for status in self.getLocalTimeline(sinceId):
            content = self.cleanhtml(status.content)
            # NOTE(review): analyze() receives the raw status object, not the
            # cleaned text -- confirm this is what SentiTooter.analyze expects.
            sentiment = self.sentiTooter.analyze(status)
            toots.append(
                {
                    "sentiment": sentiment[0],
                    "compound": sentiment[1],
                    "userName": status.account.display_name,
                    "userId": status.account.id,
                    "toot": content,
                    "datetime": status.created_at.astimezone(self.localTimezone),
                    "language": status.language,
                    "tootId": status.id,
                }
            )
        toots.sort(key=lambda record: record["datetime"])
        return pd.DataFrame.from_records(toots, columns=list(self._COLUMNS))