# hedonodon/TootCrawler.py

import html
import re
from pprint import pprint

import pandas as pd
import pytz
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

from SentiTooter import SentiTooter

class TootCrawler():
    """Collect toots from a Mastodon local timeline and score their sentiment.

    Instance attributes (all assigned in ``__init__``):
        mastodonInstance: API client used for every timeline request.
        compilePattern: compiled regex matching any ``<...>`` HTML tag.
        sentiTooter: ``SentiTooter`` instance used to analyze each toot.
        localTimezone: "Europe/Berlin" timezone that toot timestamps are
            converted to.
    """

    # Everything from "http" up to the next whitespace is treated as a link.
    _URL_PATTERN = re.compile(r'http\S+')

    def __init__(self, mastodonInstance) -> None:
        """Store the API client and set up the text-cleaning/analysis helpers.

        Args:
            mastodonInstance: client object providing ``timeline_local`` and
                ``fetch_previous``.
        """
        self.mastodonInstance = mastodonInstance
        self.compilePattern = re.compile('<.*?>')
        self.sentiTooter = SentiTooter()
        self.localTimezone = pytz.timezone('Europe/Berlin')

    def getLocalTimeline(self, minId=None):
        """Request the first page of the local timeline.

        Args:
            minId: passed through as ``min_id``; ``None`` applies no lower
                bound.

        Returns:
            Whatever ``timeline_local`` returns for this request.
        """
        # NOTE(review): 500 is the requested page size; the server may
        # enforce a lower cap -- confirm against the instance in use.
        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)

    def cleanhtml(self, raw_html: str) -> str:
        """Reduce a toot's HTML content to plain text.

        HTML tags and ``http...`` links are removed, then HTML entities
        (``&amp;``, ``&#39;``, ...) are decoded.

        Args:
            raw_html: HTML content of a toot.

        Returns:
            The plain text. Whitespace that surrounded removed tags or links
            is left untouched.
        """
        withoutTags = self.compilePattern.sub('', raw_html)
        withoutLinks = self._URL_PATTERN.sub('', withoutTags)
        # Decode entities last: escaped markup such as "&lt;b&gt;" must end
        # up as literal text instead of being stripped as if it were a tag.
        return html.unescape(withoutLinks)

    def buildTootsDataframe(self, minId=None):
        """Fetch the local timeline and return one analyzed row per toot.

        Starting from ``getLocalTimeline(minId)``, pages are followed with
        ``fetch_previous`` until it returns nothing. Toots whose cleaned text
        gives the language detector nothing to work with are skipped.

        Args:
            minId: forwarded to ``getLocalTimeline``.

        Returns:
            ``pandas.DataFrame`` sorted ascending by ``datetime`` with the
            columns ``sentiment``, ``model``, ``toot``, ``datetime``,
            ``language``, ``userName``, ``userId`` and ``tootId``. The frame
            is empty when no toot could be analyzed.
        """
        allTimelineResults = []
        timelinePagination = self.getLocalTimeline(minId)
        while timelinePagination:
            # extend() appends in place; rebuilding the list with "+" on
            # every page would copy all previously fetched toots again.
            allTimelineResults.extend(timelinePagination)
            timelinePagination = self.mastodonInstance.fetch_previous(timelinePagination)

        toots = []
        for status in allTimelineResults:
            toot = self._buildTootRecord(status)
            if toot is not None:
                toots.append(toot)
        toots.sort(key=lambda item: item['datetime'])
        return pd.DataFrame.from_records(toots)

    def _buildTootRecord(self, status):
        """Turn one timeline status into a result row, or None if it has no analyzable text."""
        content = self.cleanhtml(status.content)
        try:
            language = detect(content)
        except LangDetectException:
            # Raised when the text has no detectable features, e.g. a toot
            # that only contained media, a link or symbols. One such toot
            # must not abort the whole crawl, so it is skipped.
            return None
        # The result is read by index: first the sentiment, then the model.
        sentiment = self.sentiTooter.analyze(language, content)
        return {
            "sentiment": sentiment[0],
            "model": sentiment[1],
            "toot": content,
            "datetime": status.created_at.astimezone(self.localTimezone),
            "language": language,
            "userName": status.account.display_name,
            "userId": status.account.id,
            "tootId": status.id
        }