From 382b06abbb850cece613acb73922c19649c73a89 Mon Sep 17 00:00:00 2001
From: Michael Thomas
Date: Mon, 13 Mar 2023 11:00:24 +0000
Subject: [PATCH] Add option to backfill recent followings' posts

---
 .gitignore     |   3 +-
 get_context.py | 177 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0570248..f8aaefb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,2 @@
 .vscode/launch.json
-artifacts/replied_toot_server_ids
-artifacts/seen_urls
+artifacts/*
diff --git a/get_context.py b/get_context.py
index 37db593..82a6c7c 100644
--- a/get_context.py
+++ b/get_context.py
@@ -16,6 +16,9 @@ def pull_context(
     replied_toot_server_ids,
     reply_interval_hours,
     max_home_timeline_length,
+    max_followings,
+    backfill_followings_for_user,
+    known_followings
 ):
 
     parsed_urls = {}
@@ -42,6 +45,129 @@ def pull_context(
     known_context_urls = get_all_known_context_urls(server, timeline_toots,parsed_urls)
     add_context_urls(server, access_token, known_context_urls, seen_urls)
 
+    if max_followings > 0 and backfill_followings_for_user != '':
+        print(f"Getting posts from {backfill_followings_for_user}'s last {max_followings} followings")
+        user_id = get_user_id(server, backfill_followings_for_user)
+        followings = get_new_followings(server, user_id, max_followings, known_followings)
+        add_following_posts(server, access_token, followings, known_followings, seen_urls, parsed_urls)
+
+def add_following_posts(server, access_token, followings, know_followings, seen_urls, parsed_urls):
+    """Add each following's not-yet-seen posts via add_context_url; mark users without failures as known"""
+    for user in followings:
+        posts = get_user_posts(user, know_followings, server)
+
+        if posts is not None:
+            count = 0
+            failed = 0
+            for post in posts:
+                if post['url'] is not None and post['url'] not in seen_urls:
+                    added = add_context_url(post['url'], server, access_token)
+                    if added is True:
+                        seen_urls.add(post['url'])
+                        count += 1
+                    else:
+                        failed += 1
+            print(f"Added {count} posts for user {user['acct']} with {failed} errors")
+            if failed == 0:
+                know_followings.add(user['acct'])
+
+def get_user_posts(user, know_followings, server):
+    """Return up to 40 statuses from the server in the user's profile URL, or None if skipped or on error"""
+    parsed_url = parse_user_url(user['url'])
+
+    if parsed_url is None:
+        know_followings.add(user['acct'])
+        return None
+
+    if parsed_url[0] == server:
+        print(f"{user['acct']} is a local user. Skip")
+        know_followings.add(user['acct'])
+        return None
+
+    try:
+        user_id = get_user_id(parsed_url[0], parsed_url[1])
+    except Exception as ex:
+        print(f"Error getting user ID for user {user['acct']}: {ex}")
+        return None
+
+    url = f"https://{parsed_url[0]}/api/v1/accounts/{user_id}/statuses?limit=40"
+
+    try:
+        response = requests.get(url, headers={
+            'User-Agent': 'mastodon_get_replies (https://go.thms.uk/mgr)'
+        }, timeout=5
+        )
+
+        if response.status_code == 200:
+            return response.json()
+        elif response.status_code == 404:
+            raise Exception(
+                f"User {user['acct']} was not found on server {parsed_url[0]}"
+            )
+        else:
+            raise Exception(
+                f"Error getting URL {url}. Status code: {response.status_code}"
+            )
+    except Exception as ex:
+        print(f"Error getting posts for user {user['acct']}: {ex}")
+        return None
+
+def get_new_followings(server, user_id, max, known_followings):
+    """Return accounts followed by user_id that are not in known_followings, paging until max are fetched"""
+    url = f"https://{server}/api/v1/accounts/{user_id}/following?limit={max}"
+
+    following = []
+
+    response = requests.get(url, headers={
+        'User-Agent': 'mastodon_get_replies (https://go.thms.uk/mgr)'
+    }, timeout=5
+    )
+
+    following = following + response.json()
+
+    while len(following) < max and 'next' in response.links:
+        # Follow the Link header; re-requesting `url` would fetch the first page again.
+        response = requests.get(response.links['next']['url'], headers={
+            'User-Agent': 'mastodon_get_replies (https://go.thms.uk/mgr)'
+        }, timeout=5)
+
+        following = following + response.json()
+
+
+    new_followings = list(filter(
+        lambda user: user['acct'] not in known_followings,
+        following
+    ))
+
+    print(f"Got {len(following)} followings, {len(new_followings)} of which are new")
+
+    return new_followings
+
+
+
+def get_user_id(server, user):
+    """Look up the account `user` on `server` and return its ID; raises Exception if the lookup fails"""
+    url = f"https://{server}/api/v1/accounts/lookup?acct={user}"
+
+
+    response = requests.get(
+        url, headers={
+            'User-Agent': 'mastodon_get_replies (https://go.thms.uk/mgr)'
+        }, timeout=5
+    )
+
+    if response.status_code == 200:
+        return response.json()['id']
+    elif response.status_code == 404:
+        raise Exception(
+            f"User {user} was not found. Try to supply just the local part of the username."
+        )
+    else:
+        raise Exception(
+            f"Error getting URL {url}. Status code: {response.status_code}"
+        )
+
+
 def get_timeline(server, access_token, max):
     """Get all post in the user's home timeline"""
 
@@ -267,6 +393,20 @@ def get_replied_toot_server_id(server, toot, replied_toot_server_ids,parsed_urls
     replied_toot_server_ids[o_url] = None
     return None
 
+def parse_user_url(url):
+    """parse a Mastodon or Pleroma profile URL and return the server and username, or None"""
+    match = parse_mastodon_profile_url(url)
+    if match is not None:
+        return match
+
+    match = parse_pleroma_profile_url(url)
+    if match is not None:
+        return match
+
+    print(f"Error parsing Profile URL {url}")
+
+    return None
+
 def parse_url(url, parsed_urls):
     if url not in parsed_urls:
         match = parse_mastodon_url(url)
@@ -284,6 +424,15 @@ def parse_url(url, parsed_urls):
 
     return parsed_urls[url]
 
+def parse_mastodon_profile_url(url):
+    """parse a Mastodon Profile URL and return the server and username"""
+    match = re.match(
+        r"https://(?P<server>.*)/@(?P<username>.*)", url
+    )
+    if match is not None:
+        return (match.group("server"), match.group("username"))
+    return None
+
 def parse_mastodon_url(url):
     """parse a Mastodon URL and return the server and ID"""
     match = re.match(
@@ -309,6 +458,13 @@ def parse_pleroma_url(url):
         return None
     return None
 
+def parse_pleroma_profile_url(url):
+    """parse a Pleroma Profile URL and return the server and username"""
+    match = re.match(r"https://(?P<server>.*)/users/(?P<username>.*)", url)
+    if match is not None:
+        return (match.group("server"), match.group("username"))
+    return None
+
 def get_redirect_url(url):
     """get the URL given URL redirects to"""
 
@@ -481,6 +637,15 @@ Usage: python3 pull_context.py
     SERVER = sys.argv[2]
     REPLY_INTERVAL_IN_HOURS = int(sys.argv[3])
     MAX_HOME_TIMELINE_LENGTH = int(sys.argv[4])
+    if len(sys.argv) > 5:
+        MAX_FOLLOWINGS = int(sys.argv[5])
+    else:
+        MAX_FOLLOWINGS = 0
+
+    if len(sys.argv) > 6:
+        BACKFILL_FOLLOWINGS_FOR_USER = sys.argv[6]
+    else:
+        BACKFILL_FOLLOWINGS_FOR_USER = ''
 
     print(
         f"Getting last {REPLY_INTERVAL_IN_HOURS} hrs of replies, and latest {MAX_HOME_TIMELINE_LENGTH} posts in home timeline from {SERVER}"
@@ -488,6 +653,7 @@ Usage: python3 pull_context.py
 
     SEEN_URLS_FILE = "artifacts/seen_urls"
     REPLIED_TOOT_SERVER_IDS_FILE = "artifacts/replied_toot_server_ids"
+    KNOWN_FOLLOWINGS_FILE = "artifacts/known_followings"
 
 
     SEEN_URLS = OrderedSet([])
@@ -500,6 +666,11 @@ Usage: python3 pull_context.py
         with open(REPLIED_TOOT_SERVER_IDS_FILE, "r", encoding="utf-8") as f:
             REPLIED_TOOT_SERVER_IDS = json.load(f)
 
+    KNOWN_FOLLOWINGS = OrderedSet([])
+    if os.path.exists(KNOWN_FOLLOWINGS_FILE):
+        with open(KNOWN_FOLLOWINGS_FILE, "r", encoding="utf-8") as f:
+            KNOWN_FOLLOWINGS = OrderedSet(f.read().splitlines())
+
     pull_context(
         SERVER,
         ACCESS_TOKEN,
@@ -507,8 +678,14 @@ Usage: python3 pull_context.py
         REPLIED_TOOT_SERVER_IDS,
         REPLY_INTERVAL_IN_HOURS,
         MAX_HOME_TIMELINE_LENGTH,
+        MAX_FOLLOWINGS,
+        BACKFILL_FOLLOWINGS_FOR_USER,
+        KNOWN_FOLLOWINGS
     )
 
+    with open(KNOWN_FOLLOWINGS_FILE, "w", encoding="utf-8") as f:
+        f.write("\n".join(list(KNOWN_FOLLOWINGS)[-10000:]))
+
     with open(SEEN_URLS_FILE, "w", encoding="utf-8") as f:
         f.write("\n".join(list(SEEN_URLS)[-10000:]))
 