add code documentation

This commit is contained in:
rnsrk 2023-03-17 20:06:01 +01:00
parent 4479bd2429
commit bc842244c7
7 changed files with 261 additions and 31 deletions

View file

@ -1,27 +1,87 @@
import re
from pprint import pprint
from typing import Any

import pandas as pd
import pytz
from langdetect import detect
from pandas import DataFrame

from SentiTooter import SentiTooter
class TootCrawler():
"""Class to fetch the recent toots from fedihum.org."""
def __init__(self, mastodonInstance: Any) -> None:
    """Store the Mastodon instance and create the helpers the crawler uses.

    Parameters
    ----------
    mastodonInstance: Any
        The initialized Mastodon instance.
    """
    self.mastodonInstance = mastodonInstance
    # Non-greedy match of everything between '<' and '>'; cleanhtml uses
    # it to drop markup tags from toot content.
    self.compilePattern = re.compile('<.*?>')
    self.sentiTooter = SentiTooter()
    self.localTimezone = pytz.timezone('Europe/Berlin')
def getLocalTimeline(self, minId=None):
def getLocalTimeline(self, minId=None) -> any:
"""Receave the local timeline
Parameters
------
minId: str | None
The last fetched toot id from the database.
Returns
------
any
The local Mastodon timeline from fedihum.org.
"""
return self.mastodonInstance.timeline_local(min_id=minId, limit=500)
def cleanhtml(self, raw_html):
def cleanhtml(self, raw_html:str) -> str:
"""remove brackets and http string from toots
Parameters
------
raw_html: str
The toot content.
Returns
------
str:
The cleaned toot content.
"""
cleantext = re.sub(self.compilePattern, '', raw_html)
cleantext = re.sub(r'http\S+', '', cleantext)
return cleantext
def buildTootsDataframe(self, minId=None):
def buildTootsDataframe(self, minId=None) -> DataFrame:
"""Parse fetched toots from Mastodon to dataframe.
Parameters
------
minId: str | None
The id of the last fetched toot.
Returns
------
DataFrame
A Dataframe containing
sentiment: str
The sentiment (positive, neutral, negative)
model: str
The used sentiment model.
toot: str
The content of the toot.
datetime: datetime
The datetime of the toot.
language: str
The language flag of the toot.
userName: str.
The user name of the toot.
userId: str
The user id.
tootId: str
The toot id.
"""
toots = []
allTimelineResults = []
timelinePagination = self.getLocalTimeline(minId)