#!/usr/bin/env python3

from datetime import datetime, timedelta
from dateutil import parser
import itertools
import json
import os
import re
import sys
import requests
import time
import argparse
import uuid

argparser = argparse.ArgumentParser()

argparser.add_argument('--server', required=True, help="Required: The name of your server (e.g. `mstdn.thms.uk`)")
argparser.add_argument('--access-token', required=True, help="Required: The access token can be generated at https://<server>/settings/applications, and must have read:search, read:statuses and admin:read:accounts scopes")
argparser.add_argument('--reply-interval-in-hours', required=False, type=int, default=0, help="Fetch remote replies to posts that have received replies from users on your own instance in this period")
argparser.add_argument('--home-timeline-length', required=False, type=int, default=0, help="Look for replies to posts in the API key owner's home timeline, up to this many posts")
argparser.add_argument('--user', required=False, default='', help="Use together with --max-followings or --max-followers to tell us which user's followings/followers we should backfill")
argparser.add_argument('--max-followings', required=False, type=int, default=0, help="Backfill posts for new accounts followed by --user. We'll backfill at most this many followings' posts")
argparser.add_argument('--max-followers', required=False, type=int, default=0, help="Backfill posts for new accounts following --user. We'll backfill at most this many followers' posts")
argparser.add_argument('--max-follow-requests', required=False, type=int, default=0, help="Backfill posts of the API key owner's pending follow requests. We'll backfill at most this many requesters' posts")
argparser.add_argument('--max-bookmarks', required=False, type=int, default=0, help="Fetch remote replies to the API key owner's bookmarks. We'll fetch replies to at most this many bookmarks")
argparser.add_argument('--from-notifications', required=False, type=int, default=0, help="Backfill accounts of anyone appearing in your notifications during the last given number of hours")
argparser.add_argument('--remember-users-for-hours', required=False, type=int, default=24*7, help="How long to remember users that you aren't following for, before trying to backfill them again.")
argparser.add_argument('--http-timeout', required=False, type=int, default=5, help="The timeout for any HTTP requests to your own, or other instances.")
argparser.add_argument('--lock-hours', required=False, type=int, default=24, help="The lock timeout in hours.")
argparser.add_argument('--on-done', required=False, default=None, help="Provide a URL that will be pinged when processing has completed. You can use this for 'dead man switch' monitoring of your task")
argparser.add_argument('--on-start', required=False, default=None, help="Provide a URL that will be pinged when processing is starting. You can use this for 'dead man switch' monitoring of your task")
argparser.add_argument('--on-fail', required=False, default=None, help="Provide a URL that will be pinged when processing has failed. You can use this for 'dead man switch' monitoring of your task")

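# Example invocation (illustrative values only; substitute your own server name,
# access token, and limits; "find_posts.py" stands in for whatever this file is
# saved as):
#
#   python3 find_posts.py --server=mstdn.thms.uk --access-token=TOKEN \
#       --home-timeline-length=200 --max-bookmarks=80 --from-notifications=1

# pull_context is the top-level driver: each block below runs only when its
# corresponding command-line limit is non-zero.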
def pull_context(
    server,
    access_token,
    seen_urls,
    replied_toot_server_ids,
    reply_interval_hours,
    max_home_timeline_length,
    max_followings,
    backfill_followings_for_user,
    known_followings,
    max_followers,
    max_follow_requests,
    max_bookmarks,
    recently_checked_users,
    from_notifications
):

    parsed_urls = {}

    all_known_users = OrderedSet(list(known_followings) + list(recently_checked_users))

    if reply_interval_hours > 0:
        """pull the context toots of toots user replied to, from their
        original server, and add them to the local server."""
        user_ids = get_active_user_ids(server, access_token, reply_interval_hours)
        reply_toots = get_all_reply_toots(
            server, user_ids, access_token, seen_urls, reply_interval_hours
        )
        known_context_urls = get_all_known_context_urls(server, reply_toots, parsed_urls)
        seen_urls.update(known_context_urls)
        replied_toot_ids = get_all_replied_toot_server_ids(
            server, reply_toots, replied_toot_server_ids, parsed_urls
        )
        context_urls = get_all_context_urls(server, replied_toot_ids)
        add_context_urls(server, access_token, context_urls, seen_urls)

    if max_home_timeline_length > 0:
        """Do the same with any toots on the key owner's home timeline"""
        timeline_toots = get_timeline(server, access_token, max_home_timeline_length)
        known_context_urls = get_all_known_context_urls(server, timeline_toots, parsed_urls)
        add_context_urls(server, access_token, known_context_urls, seen_urls)

        # Backfill any post authors, and any mentioned users
        mentioned_users = []
        for toot in timeline_toots:
            mentioned_users = mentioned_users + [toot['account']]
            if len(toot['mentions']):
                mentioned_users = mentioned_users + toot['mentions']

        add_user_posts(server, access_token, filter_known_users(mentioned_users, all_known_users), recently_checked_users, all_known_users, seen_urls)

    if max_followings > 0:
        log(f"Getting posts from last {max_followings} followings")
        user_id = get_user_id(server, backfill_followings_for_user, access_token)
        followings = get_new_followings(server, user_id, max_followings, all_known_users)
        add_user_posts(server, access_token, followings, known_followings, all_known_users, seen_urls)

    if max_followers > 0:
        log(f"Getting posts from last {max_followers} followers")
        user_id = get_user_id(server, backfill_followings_for_user, access_token)
        followers = get_new_followers(server, user_id, max_followers, all_known_users)
        add_user_posts(server, access_token, followers, recently_checked_users, all_known_users, seen_urls)

    if max_follow_requests > 0:
        log(f"Getting posts from last {max_follow_requests} follow requests")
        follow_requests = get_new_follow_requests(server, access_token, max_follow_requests, all_known_users)
        add_user_posts(server, access_token, follow_requests, recently_checked_users, all_known_users, seen_urls)

    if from_notifications > 0:
        log(f"Getting notifications for last {from_notifications} hours")
        notification_users = get_notification_users(server, access_token, all_known_users, from_notifications)
        add_user_posts(server, access_token, notification_users, recently_checked_users, all_known_users, seen_urls)

    if max_bookmarks > 0:
        log(f"Pulling replies to the last {max_bookmarks} bookmarks")
        bookmarks = get_bookmarks(server, access_token, max_bookmarks)
        known_context_urls = get_all_known_context_urls(server, bookmarks, parsed_urls)
        add_context_urls(server, access_token, known_context_urls, seen_urls)

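# Note: get_notification_users passes a datetime (not an int) as the `max`
# argument to get_paginated_mastodon, so pagination continues until the
# notifications are older than that cut-off rather than until a count is reached.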
def get_notification_users(server, access_token, known_users, max_age):
    since = datetime.now(datetime.now().astimezone().tzinfo) - timedelta(hours=max_age)
    notifications = get_paginated_mastodon(f"https://{server}/api/v1/notifications", since, headers={
        "Authorization": f"Bearer {access_token}",
    })
    notification_users = []
    for notification in notifications:
        notificationDate = parser.parse(notification['created_at'])
        if notificationDate >= since and notification['account'] not in notification_users:
            notification_users.append(notification['account'])

    new_notification_users = filter_known_users(notification_users, known_users)
    log(f"Found {len(notification_users)} users in notifications, {len(new_notification_users)} of which are new")
    return new_notification_users

def get_bookmarks(server, access_token, max):
    return get_paginated_mastodon(f"https://{server}/api/v1/bookmarks", max, {
        "Authorization": f"Bearer {access_token}",
    })

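# For every user we haven't processed recently, fetch their latest statuses from
# their home server and import each one (plus its context) via our own server's
# search API. A user is only added to know_followings once all of their posts
# imported without errors, so failed users will be retried on a later run.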
def add_user_posts(server, access_token, followings, know_followings, all_known_users, seen_urls):
    for user in followings:
        if user['acct'] not in all_known_users and not user['url'].startswith(f"https://{server}/"):
            posts = get_user_posts(user, know_followings, server)

            if posts != None:
                count = 0
                failed = 0
                for post in posts:
                    if post['url'] != None and post['url'] not in seen_urls:
                        added = add_post_with_context(post, server, access_token, seen_urls)
                        if added is True:
                            seen_urls.add(post['url'])
                            count += 1
                        else:
                            failed += 1
                log(f"Added {count} posts for user {user['acct']} with {failed} errors")
                if failed == 0:
                    know_followings.add(user['acct'])
                    all_known_users.add(user['acct'])

def add_post_with_context(post, server, access_token, seen_urls):
    added = add_context_url(post['url'], server, access_token)
    if added is True:
        seen_urls.add(post['url'])
        parsed_urls = {}
        parsed = parse_url(post['url'], parsed_urls)
        if parsed == None:
            return True
        known_context_urls = get_all_known_context_urls(server, [post], parsed_urls)
        add_context_urls(server, access_token, known_context_urls, seen_urls)
        return True

    return False

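# Resolve the user's home server and username from their profile URL, look up
# their account ID there, and pull their most recent statuses (up to 40)
# directly from that remote server.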
def get_user_posts(user, know_followings, server):
    parsed_url = parse_user_url(user['url'])

    if parsed_url == None:
        # We are adding it as 'known' anyway, because we won't be able to fix this.
        know_followings.add(user['acct'])
        return None

    if parsed_url[0] == server:
        log(f"{user['acct']} is a local user. Skip")
        know_followings.add(user['acct'])
        return None

    try:
        user_id = get_user_id(parsed_url[0], parsed_url[1])
    except Exception as ex:
        log(f"Error getting user ID for user {user['acct']}: {ex}")
        return None

    try:
        url = f"https://{parsed_url[0]}/api/v1/accounts/{user_id}/statuses?limit=40"
        response = get(url)

        if response.status_code == 200:
            return response.json()
        elif response.status_code == 404:
            raise Exception(
                f"User {user['acct']} was not found on server {parsed_url[0]}"
            )
        else:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}"
            )
    except Exception as ex:
        log(f"Error getting posts for user {user['acct']}: {ex}")
        return None

def get_new_follow_requests(server, access_token, max, known_followings):
    """Get any new follow requests for the specified user, up to the max number provided"""
    follow_requests = get_paginated_mastodon(f"https://{server}/api/v1/follow_requests", max, {
        "Authorization": f"Bearer {access_token}",
    })

    # Remove any we already know about
    new_follow_requests = filter_known_users(follow_requests, known_followings)

    log(f"Got {len(follow_requests)} follow_requests, {len(new_follow_requests)} of which are new")

    return new_follow_requests

def filter_known_users(users, known_users):
    return list(filter(
        lambda user: user['acct'] not in known_users,
        users
    ))

def get_new_followers(server, user_id, max, known_followers):
    """Get any new followers for the specified user, up to the max number provided"""
    followers = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/followers", max)

    # Remove any we already know about
    new_followers = filter_known_users(followers, known_followers)

    log(f"Got {len(followers)} followers, {len(new_followers)} of which are new")

    return new_followers


def get_new_followings(server, user_id, max, known_followings):
    """Get any new followings for the specified user, up to the max number provided"""
    following = get_paginated_mastodon(f"https://{server}/api/v1/accounts/{user_id}/following", max)

    # Remove any we already know about
    new_followings = filter_known_users(following, known_followings)

    log(f"Got {len(following)} followings, {len(new_followings)} of which are new")

    return new_followings

def get_user_id(server, user=None, access_token=None):
    """Get the user id from the server, using a username"""
    headers = {}

    if user != None and user != '':
        url = f"https://{server}/api/v1/accounts/lookup?acct={user}"
    elif access_token != None:
        url = f"https://{server}/api/v1/accounts/verify_credentials"
        headers = {
            "Authorization": f"Bearer {access_token}",
        }
    else:
        raise Exception('You must supply either a user name or an access token to get a user ID')

    response = get(url, headers=headers)

    if response.status_code == 200:
        return response.json()['id']
    elif response.status_code == 404:
        raise Exception(
            f"User {user} was not found on server {server}."
        )
    else:
        raise Exception(
            f"Error getting URL {url}. Status code: {response.status_code}"
        )

def get_timeline(server, access_token, max):
    """Get all posts in the user's home timeline"""
    url = f"https://{server}/api/v1/timelines/home"

    try:
        response = get_toots(url, access_token)

        if response.status_code == 200:
            toots = response.json()
        elif response.status_code == 401:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}. "
                "Ensure your access token is correct"
            )
        elif response.status_code == 403:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}. "
                "Make sure you have the read:statuses scope enabled for your access token."
            )
        else:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}"
            )

        # Paginate as needed
        while len(toots) < max and 'next' in response.links:
            response = get_toots(response.links['next']['url'], access_token)
            toots = toots + response.json()
    except Exception as ex:
        log(f"Error getting timeline toots: {ex}")
        raise

    log(f"Found {len(toots)} toots in timeline")

    return toots

def get_toots(url, access_token):
    response = get(url, headers={
        "Authorization": f"Bearer {access_token}",
    })

    if response.status_code == 200:
        return response
    elif response.status_code == 401:
        raise Exception(
            f"Error getting URL {url}. Status code: {response.status_code}. "
            "It looks like your access token is incorrect."
        )
    elif response.status_code == 403:
        raise Exception(
            f"Error getting URL {url}. Status code: {response.status_code}. "
            "Make sure you have the read:statuses scope enabled for your access token."
        )
    else:
        raise Exception(
            f"Error getting URL {url}. Status code: {response.status_code}"
        )

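# Uses the admin accounts API (hence the admin:read:accounts scope) to find
# local users who have posted within the reply interval; only their recent
# replies are then inspected for missing context.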
def get_active_user_ids(server, access_token, reply_interval_hours):
    """get all user IDs on the server that have posted a toot in the given
    time interval"""
    since = datetime.now() - timedelta(days=reply_interval_hours / 24 + 1)
    url = f"https://{server}/api/v1/admin/accounts"
    resp = get(url, headers={
        "Authorization": f"Bearer {access_token}",
    })
    if resp.status_code == 200:
        for user in resp.json():
            last_status_at = user["account"]["last_status_at"]
            if last_status_at is not None:
                last_active = datetime.strptime(last_status_at, "%Y-%m-%d")
                if last_active > since:
                    log(f"Found active user: {user['username']}")
                    yield user["id"]
    elif resp.status_code == 401:
        raise Exception(
            f"Error getting user IDs on server {server}. Status code: {resp.status_code}. "
            "Ensure your access token is correct"
        )
    elif resp.status_code == 403:
        raise Exception(
            f"Error getting user IDs on server {server}. Status code: {resp.status_code}. "
            "Make sure you have the admin:read:accounts scope enabled for your access token."
        )
    else:
        raise Exception(
            f"Error getting user IDs on server {server}. Status code: {resp.status_code}"
        )

def get_all_reply_toots(
    server, user_ids, access_token, seen_urls, reply_interval_hours
):
    """get all replies to other users by the given users in the last day"""
    replies_since = datetime.now() - timedelta(hours=reply_interval_hours)
    reply_toots = list(
        itertools.chain.from_iterable(
            get_reply_toots(
                user_id, server, access_token, seen_urls, replies_since
            )
            for user_id in user_ids
        )
    )
    log(f"Found {len(reply_toots)} reply toots")
    return reply_toots

def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
    """get replies by the user to other users since the given date"""
    url = f"https://{server}/api/v1/accounts/{user_id}/statuses?exclude_replies=false&limit=40"

    try:
        resp = get(url, headers={
            "Authorization": f"Bearer {access_token}",
        })
    except Exception as ex:
        log(
            f"Error getting replies for user {user_id} on server {server}: {ex}"
        )
        return []

    if resp.status_code == 200:
        toots = [
            toot
            for toot in resp.json()
            if toot["in_reply_to_id"] is not None
            and toot["url"] not in seen_urls
            and datetime.strptime(toot["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
            > reply_since
        ]
        for toot in toots:
            log(f"Found reply toot: {toot['url']}")
        return toots
    elif resp.status_code == 403:
        raise Exception(
            f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}. "
            "Make sure you have the read:statuses scope enabled for your access token."
        )

    raise Exception(
        f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
    )

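# For each toot (or the toot it reblogs), work out which server it originally
# lives on, ask that server's /context endpoint for ancestors and descendants,
# and keep only the URLs that aren't already hosted on our own server.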
def get_all_known_context_urls(server, reply_toots, parsed_urls):
    """get the context toots of the given toots from their original server"""
    known_context_urls = set(
        filter(
            lambda url: not url.startswith(f"https://{server}/"),
            itertools.chain.from_iterable(
                get_toot_context(*parse_url(toot["url"] if toot["reblog"] is None else toot["reblog"]["url"], parsed_urls), toot["url"])
                for toot in filter(
                    lambda toot: toot_has_parseable_url(toot, parsed_urls),
                    reply_toots
                )
            ),
        )
    )
    log(f"Found {len(known_context_urls)} known context toots")
    return known_context_urls

def toot_has_parseable_url(toot, parsed_urls):
    parsed = parse_url(toot["url"] if toot["reblog"] is None else toot["reblog"]["url"], parsed_urls)
    if parsed is None:
        return False
    return True

def get_all_replied_toot_server_ids(
    server, reply_toots, replied_toot_server_ids, parsed_urls
):
    """get the server and ID of the toots the given toots replied to"""
    return filter(
        lambda x: x is not None,
        (
            get_replied_toot_server_id(server, toot, replied_toot_server_ids, parsed_urls)
            for toot in reply_toots
        ),
    )

def get_replied_toot_server_id(server, toot, replied_toot_server_ids, parsed_urls):
    """get the server and ID of the toot the given toot replied to"""
    in_reply_to_id = toot["in_reply_to_id"]
    in_reply_to_account_id = toot["in_reply_to_account_id"]
    mentions = [
        mention
        for mention in toot["mentions"]
        if mention["id"] == in_reply_to_account_id
    ]
    if len(mentions) == 0:
        return None

    mention = mentions[0]

    o_url = f"https://{server}/@{mention['acct']}/{in_reply_to_id}"
    if o_url in replied_toot_server_ids:
        return replied_toot_server_ids[o_url]

    url = get_redirect_url(o_url)

    if url is None:
        return None

    match = parse_url(url, parsed_urls)
    if match is not None:
        replied_toot_server_ids[o_url] = (url, match)
        return (url, match)

    log(f"Error parsing toot URL {url}")
    replied_toot_server_ids[o_url] = None
    return None

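# URL parsing helpers: each parser below tries Mastodon-, Pleroma- and
# Pixelfed-style URL patterns in turn and returns a (server, id-or-username)
# tuple, caching status-URL results in parsed_urls. For example (illustrative
# values only), "https://example.social/@alice/110220033044055066" would parse
# to ("example.social", "110220033044055066").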
def parse_user_url(url):
    match = parse_mastodon_profile_url(url)
    if match is not None:
        return match

    match = parse_pleroma_profile_url(url)
    if match is not None:
        return match

    match = parse_pixelfed_profile_url(url)
    if match is not None:
        return match

    log(f"Error parsing Profile URL {url}")

    return None

def parse_url(url, parsed_urls):
    if url not in parsed_urls:
        match = parse_mastodon_url(url)
        if match is not None:
            parsed_urls[url] = match

    if url not in parsed_urls:
        match = parse_pleroma_url(url)
        if match is not None:
            parsed_urls[url] = match

    if url not in parsed_urls:
        match = parse_pixelfed_url(url)
        if match is not None:
            parsed_urls[url] = match

    if url not in parsed_urls:
        log(f"Error parsing toot URL {url}")
        parsed_urls[url] = None

    return parsed_urls[url]

def parse_mastodon_profile_url(url):
    """parse a Mastodon Profile URL and return the server and username"""
    match = re.match(
        r"https://(?P<server>.*)/@(?P<username>.*)", url
    )
    if match is not None:
        return (match.group("server"), match.group("username"))
    return None


def parse_mastodon_url(url):
    """parse a Mastodon URL and return the server and ID"""
    match = re.match(
        r"https://(?P<server>.*)/@(?P<username>.*)/(?P<toot_id>.*)", url
    )
    if match is not None:
        return (match.group("server"), match.group("toot_id"))
    return None


def parse_pleroma_url(url):
    """parse a Pleroma URL and return the server and ID"""
    match = re.match(r"https://(?P<server>.*)/objects/(?P<toot_id>.*)", url)
    if match is not None:
        server = match.group("server")
        url = get_redirect_url(url)
        if url is None:
            return None

        match = re.match(r"/notice/(?P<toot_id>.*)", url)
        if match is not None:
            return (server, match.group("toot_id"))
        return None
    return None


def parse_pleroma_profile_url(url):
    """parse a Pleroma Profile URL and return the server and username"""
    match = re.match(r"https://(?P<server>.*)/users/(?P<username>.*)", url)
    if match is not None:
        return (match.group("server"), match.group("username"))
    return None

def parse_pixelfed_url(url):
    """parse a Pixelfed URL and return the server and ID"""
    match = re.match(
        r"https://(?P<server>.*)/p/(?P<username>.*)/(?P<toot_id>.*)", url
    )
    if match is not None:
        return (match.group("server"), match.group("toot_id"))
    return None


def parse_pixelfed_profile_url(url):
    """parse a Pixelfed Profile URL and return the server and username"""
    match = re.match(r"https://(?P<server>.*)/(?P<username>.*)", url)
    if match is not None:
        return (match.group("server"), match.group("username"))
    return None

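# Pleroma status URLs of the form https://<server>/objects/<id> redirect to a
# /notice/<id> path, so parse_pleroma_url (above) relies on this helper to
# discover the notice ID via a HEAD request.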
def get_redirect_url(url):
    """get the URL the given URL redirects to"""
    try:
        resp = requests.head(url, allow_redirects=False, timeout=5, headers={
            'User-Agent': 'FediFetcher (https://go.thms.uk/mgr)'
        })
    except Exception as ex:
        log(f"Error getting redirect URL for URL {url}. Exception: {ex}")
        return None

    if resp.status_code == 200:
        return url
    elif resp.status_code == 302:
        redirect_url = resp.headers["Location"]
        log(f"Discovered redirect for URL {url}")
        return redirect_url
    else:
        log(
            f"Error getting redirect URL for URL {url}. Status code: {resp.status_code}"
        )
        return None

def get_all_context_urls(server, replied_toot_ids):
    """get the URLs of the context toots of the given toots"""
    return filter(
        lambda url: not url.startswith(f"https://{server}/"),
        itertools.chain.from_iterable(
            get_toot_context(server, toot_id, url)
            for (url, (server, toot_id)) in replied_toot_ids
        ),
    )

def get_toot_context(server, toot_id, toot_url):
    """get the URLs of the context toots of the given toot"""
    url = f"https://{server}/api/v1/statuses/{toot_id}/context"
    try:
        resp = get(url)
    except Exception as ex:
        log(f"Error getting context for toot {toot_url}. Exception: {ex}")
        return []

    if resp.status_code == 200:
        try:
            res = resp.json()
            log(f"Got context for toot {toot_url}")
            return (toot["url"] for toot in (res["ancestors"] + res["descendants"]))
        except Exception as ex:
            log(f"Error parsing context for toot {toot_url}. Exception: {ex}")
        return []
    elif resp.status_code == 429:
        reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
        log(f"Rate Limit hit when getting context for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
        time.sleep((reset - datetime.now()).total_seconds() + 1)
        return get_toot_context(server, toot_id, toot_url)

    log(
        f"Error getting context for toot {toot_url}. Status code: {resp.status_code}"
    )
    return []

def add_context_urls(server, access_token, context_urls, seen_urls):
    """add the given toot URLs to the server"""
    count = 0
    failed = 0
    for url in context_urls:
        if url not in seen_urls:
            added = add_context_url(url, server, access_token)
            if added is True:
                seen_urls.add(url)
                count += 1
            else:
                failed += 1

    log(f"Added {count} new context toots (with {failed} failures)")

def add_context_url(url, server, access_token):
    """add the given toot URL to the server"""
    search_url = f"https://{server}/api/v2/search?q={url}&resolve=true&limit=1"

    try:
        resp = get(search_url, headers={
            "Authorization": f"Bearer {access_token}",
        })
    except Exception as ex:
        log(
            f"Error adding url {search_url} to server {server}. Exception: {ex}"
        )
        return False

    if resp.status_code == 200:
        log(f"Added context url {url}")
        return True
    elif resp.status_code == 403:
        log(
            f"Error adding url {search_url} to server {server}. Status code: {resp.status_code}. "
            "Make sure you have the read:search scope enabled for your access token."
        )
        return False
    elif resp.status_code == 429:
        reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
        log(f"Rate Limit hit when adding url {search_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
        time.sleep((reset - datetime.now()).total_seconds() + 1)
        return add_context_url(url, server, access_token)
    else:
        log(
            f"Error adding url {search_url} to server {server}. Status code: {resp.status_code}"
        )
        return False

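# The `max` argument is overloaded here: an int means "stop once we have this
# many results", while a datetime (as used for notifications) means "keep
# following the `next` pagination links until results are older than this".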
def get_paginated_mastodon(url, max, headers={}, timeout=0, max_tries=5):
    """Make a paginated request to mastodon"""
    if isinstance(max, int):
        furl = f"{url}?limit={max}"
    else:
        furl = url

    response = get(furl, headers, timeout, max_tries)

    if response.status_code != 200:
        if response.status_code == 401:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}. "
                "Ensure your access token is correct"
            )
        elif response.status_code == 403:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}. "
                "Make sure you have the correct scopes enabled for your access token."
            )
        else:
            raise Exception(
                f"Error getting URL {url}. Status code: {response.status_code}"
            )

    result = response.json()

    if isinstance(max, int):
        while len(result) < max and 'next' in response.links:
            response = get(response.links['next']['url'], headers, timeout, max_tries)
            result = result + response.json()
    else:
        while parser.parse(result[-1]['created_at']) >= max and 'next' in response.links:
            response = get(response.links['next']['url'], headers, timeout, max_tries)
            result = result + response.json()

    return result

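# Thin wrapper around requests.get: sets the FediFetcher User-Agent, applies the
# --http-timeout default, and on HTTP 429 sleeps until the time given in the
# x-ratelimit-reset header before retrying (up to max_tries attempts).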
def get(url, headers={}, timeout=0, max_tries=5):
    """A simple wrapper to make a get request while providing our user agent, and respecting rate limits"""
    h = headers.copy()
    if 'User-Agent' not in h:
        h['User-Agent'] = 'FediFetcher (https://go.thms.uk/mgr)'

    if timeout == 0:
        timeout = arguments.http_timeout

    response = requests.get(url, headers=h, timeout=timeout)
    if response.status_code == 429:
        if max_tries > 0:
            reset = parser.parse(response.headers['x-ratelimit-reset'])
            now = datetime.now(datetime.now().astimezone().tzinfo)
            wait = (reset - now).total_seconds() + 1
            log(f"Rate Limit hit requesting {url}. Waiting {wait} sec to retry at {response.headers['x-ratelimit-reset']}")
            time.sleep(wait)
            return get(url, headers, timeout, max_tries - 1)

        raise Exception(f"Maximum number of retries exceeded for rate limited request {url}")
    return response

def log(text):
    print(f"{datetime.now()} {datetime.now().astimezone().tzinfo}: {text}")

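# Insertion-ordered set backed by a dict that maps each item to the time it was
# added (or a supplied timestamp). The timestamps are what allow
# recently-checked users to be expired after --remember-users-for-hours.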
class OrderedSet:
    """An ordered set implementation over a dict"""

    def __init__(self, iterable):
        self._dict = {}
        if isinstance(iterable, dict):
            for item in iterable:
                if isinstance(iterable[item], str):
                    self.add(item, parser.parse(iterable[item]))
                else:
                    self.add(item, iterable[item])
        else:
            for item in iterable:
                self.add(item)

    def add(self, item, time=None):
        if item not in self._dict:
            if time == None:
                self._dict[item] = datetime.now(datetime.now().astimezone().tzinfo)
            else:
                self._dict[item] = time

    def pop(self, item):
        self._dict.pop(item)

    def get(self, item):
        return self._dict[item]

    def update(self, iterable):
        for item in iterable:
            self.add(item)

    def __contains__(self, item):
        return item in self._dict

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def toJSON(self, file):
        # Serialise the underlying dict to the given file handle; datetime
        # values are written via their default string representation.
        return json.dump(self._dict, file, default=str)

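# Script entry point: optionally ping the --on-start URL, take out a lock file
# so overlapping runs don't stomp on each other, load persisted state from the
# artifacts/ directory, run pull_context, write the (trimmed) state back out,
# and finally ping --on-done or --on-fail as appropriate.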
if __name__ == "__main__":
    start = datetime.now()

    log(f"Starting FediFetcher")

    arguments = argparser.parse_args()

    runId = uuid.uuid4()

    if arguments.on_start != None and arguments.on_start != '':
        try:
            get(f"{arguments.on_start}?rid={runId}")
        except Exception as ex:
            log(f"Error getting callback url: {ex}")

    LOCK_FILE = 'artifacts/lock.lock'

    if os.path.exists(LOCK_FILE):
        log(f"Lock file exists at {LOCK_FILE}")

        try:
            with open(LOCK_FILE, "r", encoding="utf-8") as f:
                lock_time = parser.parse(f.read())

            if (datetime.now() - lock_time).total_seconds() >= arguments.lock_hours * 60 * 60:
                os.remove(LOCK_FILE)
                log(f"Lock file has expired. Removed lock file.")
            else:
                log(f"Lock file age is {datetime.now() - lock_time} - below --lock-hours={arguments.lock_hours} provided.")
                if arguments.on_fail != None and arguments.on_fail != '':
                    try:
                        get(f"{arguments.on_fail}?rid={runId}")
                    except Exception as ex:
                        log(f"Error getting callback url: {ex}")
                sys.exit(1)

        except Exception:
            log(f"Cannot read lock file age - aborting.")
            if arguments.on_fail != None and arguments.on_fail != '':
                try:
                    get(f"{arguments.on_fail}?rid={runId}")
                except Exception as ex:
                    log(f"Error getting callback url: {ex}")
            sys.exit(1)

    with open(LOCK_FILE, "w", encoding="utf-8") as f:
        f.write(f"{datetime.now()}")

    try:
        SEEN_URLS_FILE = "artifacts/seen_urls"
        REPLIED_TOOT_SERVER_IDS_FILE = "artifacts/replied_toot_server_ids"
        KNOWN_FOLLOWINGS_FILE = "artifacts/known_followings"
        RECENTLY_CHECKED_USERS_FILE = "artifacts/recently_checked_users"

        SEEN_URLS = OrderedSet([])
        if os.path.exists(SEEN_URLS_FILE):
            with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f:
                SEEN_URLS = OrderedSet(f.read().splitlines())

        REPLIED_TOOT_SERVER_IDS = {}
        if os.path.exists(REPLIED_TOOT_SERVER_IDS_FILE):
            with open(REPLIED_TOOT_SERVER_IDS_FILE, "r", encoding="utf-8") as f:
                REPLIED_TOOT_SERVER_IDS = json.load(f)

        KNOWN_FOLLOWINGS = OrderedSet([])
        if os.path.exists(KNOWN_FOLLOWINGS_FILE):
            with open(KNOWN_FOLLOWINGS_FILE, "r", encoding="utf-8") as f:
                KNOWN_FOLLOWINGS = OrderedSet(f.read().splitlines())

        RECENTLY_CHECKED_USERS = OrderedSet({})
        if os.path.exists(RECENTLY_CHECKED_USERS_FILE):
            with open(RECENTLY_CHECKED_USERS_FILE, "r", encoding="utf-8") as f:
                RECENTLY_CHECKED_USERS = OrderedSet(json.load(f))

        # Remove any users whose last check is too long in the past from the list
        for user in list(RECENTLY_CHECKED_USERS):
            lastCheck = RECENTLY_CHECKED_USERS.get(user)
            userAge = datetime.now(lastCheck.tzinfo) - lastCheck
            if userAge.total_seconds() > arguments.remember_users_for_hours * 60 * 60:
                RECENTLY_CHECKED_USERS.pop(user)

        pull_context(
            arguments.server,
            arguments.access_token,
            SEEN_URLS,
            REPLIED_TOOT_SERVER_IDS,
            arguments.reply_interval_in_hours,
            arguments.home_timeline_length,
            arguments.max_followings,
            arguments.user,
            KNOWN_FOLLOWINGS,
            arguments.max_followers,
            arguments.max_follow_requests,
            arguments.max_bookmarks,
            RECENTLY_CHECKED_USERS,
            arguments.from_notifications,
        )

        with open(KNOWN_FOLLOWINGS_FILE, "w", encoding="utf-8") as f:
            f.write("\n".join(list(KNOWN_FOLLOWINGS)[-10000:]))

        with open(SEEN_URLS_FILE, "w", encoding="utf-8") as f:
            f.write("\n".join(list(SEEN_URLS)[-10000:]))

        with open(REPLIED_TOOT_SERVER_IDS_FILE, "w", encoding="utf-8") as f:
            json.dump(dict(list(REPLIED_TOOT_SERVER_IDS.items())[-10000:]), f)

        with open(RECENTLY_CHECKED_USERS_FILE, "w", encoding="utf-8") as f:
            RECENTLY_CHECKED_USERS.toJSON(f)

        os.remove(LOCK_FILE)

        if arguments.on_done != None and arguments.on_done != '':
            try:
                get(f"{arguments.on_done}?rid={runId}")
            except Exception as ex:
                log(f"Error getting callback url: {ex}")

        log(f"Processing finished in {datetime.now() - start}.")

    except Exception as ex:
        os.remove(LOCK_FILE)
        log(f"Job failed after {datetime.now() - start}.")
        if arguments.on_fail != None and arguments.on_fail != '':
            try:
                get(f"{arguments.on_fail}?rid={runId}")
            except Exception as ex:
                log(f"Error getting callback url: {ex}")
        raise