111 lines
No EOL
3.5 KiB
Python
111 lines
No EOL
3.5 KiB
Python
import re
from pprint import pprint
from typing import Any

import pandas as pd
import pytz
from langdetect import detect
from pandas import DataFrame

from SentiTooter import SentiTooter
|
|
|
|
class TootCrawler:
    """Fetch toots from a Mastodon local timeline and enrich them.

    Each fetched toot is stripped of HTML tags and links, run through
    language detection and sentiment analysis, and collected into a
    pandas DataFrame. The original author's notes name fedihum.org as the
    instance being crawled; the instance actually used is whatever
    ``mastodonInstance`` is connected to.
    """

    def __init__(self, mastodonInstance: Any) -> None:
        """Store the Mastodon client and set up the helper objects.

        Parameters
        ----------
        mastodonInstance: Any
            The initialized Mastodon instance. It must provide
            ``timeline_local`` and ``fetch_previous``.
        """
        self.mastodonInstance = mastodonInstance
        # Non-greedy match of anything between angle brackets, i.e. HTML tags.
        self.compilePattern = re.compile('<.*?>')
        self.sentiTooter = SentiTooter()
        # All toot timestamps are converted to this timezone.
        self.localTimezone = pytz.timezone('Europe/Berlin')

    def getLocalTimeline(self, minId=None) -> Any:
        """Receive the first page of the local timeline.

        Parameters
        ----------
        minId: str | None
            The last fetched toot id from the database, forwarded to the
            client as ``min_id``. ``None`` applies no lower bound.

        Returns
        -------
        Any
            Whatever ``timeline_local`` returns; the rest of this class
            treats it as a list-like page of toots.
        """
        # NOTE(review): the server may cap the page size below the requested
        # limit of 500 -- confirm against the Mastodon API documentation.
        return self.mastodonInstance.timeline_local(min_id=minId, limit=500)

    def cleanhtml(self, raw_html: str) -> str:
        """Remove HTML tags and ``http...`` links from a toot.

        Parameters
        ----------
        raw_html: str
            The toot content.

        Returns
        -------
        str
            The cleaned toot content. Whitespace surrounding a removed
            tag or link is left untouched.
        """
        # Tags go first so that a closing tag glued to a URL is not swallowed
        # by the URL pattern (or vice versa).
        withoutTags = self.compilePattern.sub('', raw_html)
        return re.sub(r'http\S+', '', withoutTags)

    def _collectTimeline(self, minId=None) -> list:
        """Gather the toots of every timeline page into one flat list."""
        collected = []
        page = self.getLocalTimeline(minId)
        # The loop ends as soon as the client hands back a falsy page
        # (an empty list or None).
        while page:
            # extend() appends in place instead of re-copying the whole
            # accumulated list for every page.
            collected.extend(page)
            page = self.mastodonInstance.fetch_previous(page)
        return collected

    def _detectLanguage(self, content: str):
        """Return the detected language code of *content*, or None on failure."""
        try:
            return detect(content)
        except Exception:
            # Detection is best-effort: text without usable features (for
            # example a toot that only held a link) makes the detector raise.
            # Catching Exception instead of using a bare ``except`` lets
            # KeyboardInterrupt and SystemExit propagate.
            return None

    def buildTootsDataframe(self, minId=None) -> DataFrame:
        """Parse fetched toots from Mastodon to a dataframe.

        Parameters
        ----------
        minId: str | None
            The id of the last fetched toot.

        Returns
        -------
        DataFrame
            One row per toot, sorted ascending by ``datetime``, with the
            columns

            sentiment
                First element of the sentiment analysis result
                (per the original notes: positive, neutral, negative).
            model
                Second element of the sentiment analysis result
                (per the original notes: the sentiment model used).
            toot
                The cleaned content of the toot.
            datetime
                Creation time of the toot, converted to ``localTimezone``.
            language
                The detected language flag of the toot, or None if
                detection failed.
            userName
                The display name of the toot's author.
            userId
                The account id of the toot's author.
            tootId
                The toot id.

            If no toots were fetched the frame is empty.
        """
        records = []
        for status in self._collectTimeline(minId):
            content = self.cleanhtml(status.content)
            language = self._detectLanguage(content)
            sentiment = self.sentiTooter.analyze(language, content)
            records.append({
                "sentiment": sentiment[0],
                "model": sentiment[1],
                "toot": content,
                "datetime": status.created_at.astimezone(self.localTimezone),
                "language": language,
                "userName": status.account.display_name,
                "userId": status.account.id,
                "tootId": status.id,
            })
        # Pages arrive in fetch order; return the rows chronologically.
        records.sort(key=lambda record: record["datetime"])
        return pd.DataFrame.from_records(records)