From c1f0e8ac61b58d6f39cf188b4343fe8c61b73a39 Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Thu, 29 Jun 2023 00:20:12 -0400 Subject: [PATCH 1/9] feat: lemmy --- find_posts.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/find_posts.py b/find_posts.py index e442d9c..2ad366b 100644 --- a/find_posts.py +++ b/find_posts.py @@ -439,6 +439,10 @@ def parse_user_url(url): if match is not None: return match + match = parse_lemmy_profile_url(url) + if match is not None: + return match + log(f"Error parsing Profile URL {url}") return None @@ -459,6 +463,11 @@ def parse_url(url, parsed_urls): if match is not None: parsed_urls[url] = match + if url not in parsed_urls: + match = parse_lemmy_url(url) + if match is not None: + parsed_urls[url] = match + if url not in parsed_urls: log(f"Error parsing toot URL {url}") parsed_urls[url] = None @@ -522,6 +531,25 @@ def parse_pixelfed_profile_url(url): return (match.group("server"), match.group("username")) return None +def parse_lemmy_url(url): + """parse a Lemmy URL and return the server, and ID""" + match = re.match( + r"https://(?P[^/]+)/comment/(?P[^/]+)", url + ) + if match is None: + match = re.match( + r"https://(?P[^/]+)/post/(?P[^/]+)", url + ) + if match is not None: + return (match.group("server"), match.group("toot_id")) + return None + +def parse_lemmy_profile_url(url): + """parse a Lemmy Profile URL and return the server and username""" + match = re.match(r"https://(?P[^/]+)/u/(?P[^/]+)", url) + if match is not None: + return (match.group("server"), match.group("username")) + return None def get_redirect_url(url): """get the URL given URL redirects to""" @@ -559,6 +587,10 @@ def get_all_context_urls(server, replied_toot_ids): def get_toot_context(server, toot_id, toot_url): """get the URLs of the context toots of the given toot""" + if toot_url.find("/comment/") != -1: + return get_comment_context(server, toot_id, toot_url) + if toot_url.find("/post/") != -1: + return get_comments_urls(server, toot_id, toot_url) url = f"https://{server}/api/v1/statuses/{toot_id}/context" try: resp = get(url) @@ -585,6 +617,58 @@ def get_toot_context(server, toot_id, toot_url): ) return [] +def get_comment_context(server, toot_id, toot_url): + """get the URLs of the context toots of the given toot""" + comment = f"https://{server}/api/v3/comment?id={toot_id}" + try: + resp = get(comment) + except Exception as ex: + log(f"Error getting comment {toot_id} from {toot_url}. Exception: {ex}") + return [] + + if resp.status_code == 200: + try: + res = resp.json() + post_id = res['comment_view']['comment']['post_id'] + log(f"Got parent post ID {post_id} for comment {toot_url}") + return get_comments_urls(server, post_id, toot_url) + except Exception as ex: + log(f"Error parsing context for comment {toot_url}. Exception: {ex}") + return [] + elif resp.status_code == 429: + reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ') + log(f"Rate Limit hit when getting context for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}") + time.sleep((reset - datetime.now()).total_seconds() + 1) + return get_comment_context(server, toot_id, toot_url) + +def get_comments_urls(server, post_id, toot_url): + """get the URLs of the comments of the given post""" + url = f"https://{server}/api/v3/comment/list?post_id={post_id}" + try: + resp = get(url) + except Exception as ex: + log(f"Error getting comments for post {post_id} from {toot_url}. Exception: {ex}") + return [] + + if resp.status_code == 200: + try: + res = resp.json() + list_of_urls = [comment_info['comment']['ap_id'] for comment_info in res['comments']] + log(f"Got {len(list_of_urls)} comments for post {toot_url}") + return list_of_urls + except Exception as ex: + log(f"Error parsing comments for post {toot_url}. Exception: {ex}") + return [] + elif resp.status_code == 429: + reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ') + log(f"Rate Limit hit when getting comments for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}") + time.sleep((reset - datetime.now()).total_seconds() + 1) + return get_comments_urls(server, post_id, toot_url) + + log( + f"Error getting comments for post {toot_url}. Status code: {resp.status_code}" + ) + return [] def add_context_urls(server, access_token, context_urls, seen_urls): """add the given toot URLs to the server""" From b7ef2be02e8633d08267705f354234d5755c046f Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Thu, 29 Jun 2023 01:57:33 -0400 Subject: [PATCH 2/9] chore: refactor get_all_known_context_urls --- find_posts.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/find_posts.py b/find_posts.py index 2ad366b..8d5fb3c 100644 --- a/find_posts.py +++ b/find_posts.py @@ -356,21 +356,20 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since): ) -def get_all_known_context_urls(server, reply_toots,parsed_urls): +def get_all_known_context_urls(server, reply_toots, parsed_urls): """get the context toots of the given toots from their original server""" - known_context_urls = set( - filter( - lambda url: not url.startswith(f"https://{server}/"), - itertools.chain.from_iterable( - get_toot_context(*parse_url(toot["url"] if toot["reblog"] is None else toot["reblog"]["url"],parsed_urls), toot["url"]) - for toot in filter( - lambda toot: toot_has_parseable_url(toot,parsed_urls), - reply_toots - ) - ), - ) - ) + known_context_urls = set() + + for toot in reply_toots: + if toot_has_parseable_url(toot, parsed_urls): + url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"] + parsed_url = parse_url(url, parsed_urls) + context = get_toot_context(parsed_url[0], parsed_url[1], url) + known_context_urls.update(context) # type: ignore + + known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls)) log(f"Found {len(known_context_urls)} known context toots") + return known_context_urls @@ -1050,4 +1049,4 @@ if __name__ == "__main__": get(f"{arguments.on_fail}?rid={runId}") except Exception as ex: log(f"Error getting callback url: {ex}") - raise + raise \ No newline at end of file From e290f2c05f80832c42a6c282d7d16792dab80bf8 Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 00:11:33 -0400 Subject: [PATCH 3/9] chore: use getters --- find_posts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/find_posts.py b/find_posts.py index 8d5fb3c..22f23db 100644 --- a/find_posts.py +++ b/find_posts.py @@ -74,7 +74,7 @@ def add_user_posts(server, access_token, followings, know_followings, all_known_ count = 0 failed = 0 for post in posts: - if post['reblog'] == None and post['url'] != None and post['url'] not in seen_urls: + if post.get('reblog') is None and post.get('url') is not None and post.get('url') not in seen_urls: added = add_post_with_context(post, server, access_token, seen_urls) if added is True: seen_urls.add(post['url']) @@ -90,7 +90,7 @@ def add_post_with_context(post, server, access_token, seen_urls): added = add_context_url(post['url'], server, access_token) if added is True: seen_urls.add(post['url']) - if (post['replies_count'] or post['in_reply_to_id']) and arguments.backfill_with_context > 0: + if ('replies_count' in post or 'in_reply_to_id' in post) and getattr(arguments, 'backfill_with_context', 0) > 0: parsed_urls = {} parsed = parse_url(post['url'], parsed_urls) if parsed == None: From 4011883ef2f7bb757c151ac2974e2c04a3502803 Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 00:16:59 -0400 Subject: [PATCH 4/9] feat: lemmy-2 --- find_posts.py | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/find_posts.py b/find_posts.py index 22f23db..04c2255 100644 --- a/find_posts.py +++ b/find_posts.py @@ -113,6 +113,37 @@ def get_user_posts(user, know_followings, server): log(f"{user['acct']} is a local user. Skip") know_followings.add(user['acct']) return None + if re.match(r"^https:\/\/[^\/]+\/c\/", user['url']): + try: + url = f"https://{parsed_url[0]}/api/v3/post/list?community_name={parsed_url[1]}&sort=New&limit=50" + response = get(url) + + if(response.status_code == 200): + posts = [post['post'] for post in response.json()['posts']] + for post in posts: + post['url'] = post['ap_id'] + return posts + + except Exception as ex: + log(f"Error getting community posts for community {parsed_url[1]}: {ex}") + return None + + if re.match(r"^https:\/\/[^\/]+\/u\/", user['url']): + try: + url = f"https://{parsed_url[0]}/api/v3/user?username={parsed_url[1]}&sort=New&limit=50" + response = get(url) + + if(response.status_code == 200): + comments = [post['post'] for post in response.json()['comments']] + posts = [post['post'] for post in response.json()['posts']] + all_posts = comments + posts + for post in all_posts: + post['url'] = post['ap_id'] + return all_posts + + except Exception as ex: + log(f"Error getting user posts for user {parsed_url[1]}: {ex}") + return None try: user_id = get_user_id(parsed_url[0], parsed_url[1]) @@ -533,19 +564,15 @@ def parse_pixelfed_profile_url(url): def parse_lemmy_url(url): """parse a Lemmy URL and return the server, and ID""" match = re.match( - r"https://(?P[^/]+)/comment/(?P[^/]+)", url + r"https://(?P[^/]+)/(?:comment|post)/(?P[^/]+)", url ) - if match is None: - match = re.match( - r"https://(?P[^/]+)/post/(?P[^/]+)", url - ) if match is not None: return (match.group("server"), match.group("toot_id")) return None def parse_lemmy_profile_url(url): """parse a Lemmy Profile URL and return the server and username""" - match = re.match(r"https://(?P[^/]+)/u/(?P[^/]+)", url) + match = re.match(r"https://(?P[^/]+)/(?:u|c)/(?P[^/]+)", url) if match is not None: return (match.group("server"), match.group("username")) return None @@ -642,7 +669,7 @@ def get_comment_context(server, toot_id, toot_url): def get_comments_urls(server, post_id, toot_url): """get the URLs of the comments of the given post""" - url = f"https://{server}/api/v3/comment/list?post_id={post_id}" + url = f"https://{server}/api/v3/comment/list?post_id={post_id}&sort=New&limit=50" try: resp = get(url) except Exception as ex: From 4751d96a1d3bbdbd645ea3306343379db53baa13 Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 00:18:39 -0400 Subject: [PATCH 5/9] chore: check none type --- find_posts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/find_posts.py b/find_posts.py index 04c2255..dffe86f 100644 --- a/find_posts.py +++ b/find_posts.py @@ -396,7 +396,10 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls): url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"] parsed_url = parse_url(url, parsed_urls) context = get_toot_context(parsed_url[0], parsed_url[1], url) - known_context_urls.update(context) # type: ignore + if context is not None: + known_context_urls.update(context) # type: ignore + else: + log(f"Error getting context for toot {url}") known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls)) log(f"Found {len(known_context_urls)} known context toots") From b04664f9d5311ca6ea6ae65b77dade671540a92d Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 00:21:17 -0400 Subject: [PATCH 6/9] chore: deliminate regex with forward slash --- find_posts.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/find_posts.py b/find_posts.py index dffe86f..fbf4c93 100644 --- a/find_posts.py +++ b/find_posts.py @@ -510,7 +510,7 @@ def parse_url(url, parsed_urls): def parse_mastodon_profile_url(url): """parse a Mastodon Profile URL and return the server and username""" match = re.match( - r"https://(?P.*)/@(?P.*)", url + r"https://(?P[^/]+)/@(?P[^/]+)", url ) if match is not None: return (match.group("server"), match.group("username")) @@ -519,7 +519,7 @@ def parse_mastodon_profile_url(url): def parse_mastodon_url(url): """parse a Mastodon URL and return the server and ID""" match = re.match( - r"https://(?P.*)/@(?P.*)/(?P.*)", url + r"https://(?P[^/]+)/@(?P[^/]+)/(?P[^/]+)", url ) if match is not None: return (match.group("server"), match.group("toot_id")) @@ -528,14 +528,14 @@ def parse_mastodon_url(url): def parse_pleroma_url(url): """parse a Pleroma URL and return the server and ID""" - match = re.match(r"https://(?P.*)/objects/(?P.*)", url) + match = re.match(r"https://(?P[^/]+)/objects/(?P[^/]+)", url) if match is not None: server = match.group("server") url = get_redirect_url(url) if url is None: return None - match = re.match(r"/notice/(?P.*)", url) + match = re.match(r"/notice/(?P[^/]+)", url) if match is not None: return (server, match.group("toot_id")) return None @@ -543,7 +543,7 @@ def parse_pleroma_url(url): def parse_pleroma_profile_url(url): """parse a Pleroma Profile URL and return the server and username""" - match = re.match(r"https://(?P.*)/users/(?P.*)", url) + match = re.match(r"https://(?P[^/]+)/users/(?P[^/]+)", url) if match is not None: return (match.group("server"), match.group("username")) return None @@ -551,7 +551,7 @@ def parse_pleroma_profile_url(url): def parse_pixelfed_url(url): """parse a Pixelfed URL and return the server and ID""" match = re.match( - r"https://(?P.*)/p/(?P.*)/(?P.*)", url + r"https://(?P[^/]+)/p/(?P[^/]+)/(?P[^/]+)", url ) if match is not None: return (match.group("server"), match.group("toot_id")) @@ -559,7 +559,7 @@ def parse_pixelfed_url(url): def parse_pixelfed_profile_url(url): """parse a Pixelfed Profile URL and return the server and username""" - match = re.match(r"https://(?P.*)/(?P.*)", url) + match = re.match(r"https://(?P[^/]+)/(?P[^/]+)", url) if match is not None: return (match.group("server"), match.group("username")) return None From 8edfbc030cc48b092d58d2cbc1d5c1b9cc80e475 Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 01:33:19 -0400 Subject: [PATCH 7/9] chore: access context items safely --- find_posts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/find_posts.py b/find_posts.py index fbf4c93..ac0b318 100644 --- a/find_posts.py +++ b/find_posts.py @@ -397,7 +397,8 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls): parsed_url = parse_url(url, parsed_urls) context = get_toot_context(parsed_url[0], parsed_url[1], url) if context is not None: - known_context_urls.update(context) # type: ignore + for item in context: + known_context_urls.add(item) else: log(f"Error getting context for toot {url}") From 0472fe6e0c4b28fbbb5e237a94ce8775a7fc6f37 Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 01:34:30 -0400 Subject: [PATCH 8/9] fix: match pixelfed profile last --- find_posts.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/find_posts.py b/find_posts.py index ac0b318..4dd7381 100644 --- a/find_posts.py +++ b/find_posts.py @@ -469,11 +469,12 @@ def parse_user_url(url): if match is not None: return match - match = parse_pixelfed_profile_url(url) + match = parse_lemmy_profile_url(url) if match is not None: return match - match = parse_lemmy_profile_url(url) +# Pixelfed profile paths do not use a subdirectory, so we need to match for them last. + match = parse_pixelfed_profile_url(url) if match is not None: return match @@ -493,12 +494,12 @@ def parse_url(url, parsed_urls): parsed_urls[url] = match if url not in parsed_urls: - match = parse_pixelfed_url(url) + match = parse_lemmy_url(url) if match is not None: parsed_urls[url] = match if url not in parsed_urls: - match = parse_lemmy_url(url) + match = parse_pixelfed_url(url) if match is not None: parsed_urls[url] = match From 6f7392cfaa218bb8f271e80e274671124dd359dc Mon Sep 17 00:00:00 2001 From: Timothy Quilling Date: Fri, 30 Jun 2023 01:36:56 -0400 Subject: [PATCH 9/9] feat: fetch root lemmy post --- find_posts.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/find_posts.py b/find_posts.py index 4dd7381..2fb84d6 100644 --- a/find_posts.py +++ b/find_posts.py @@ -661,7 +661,6 @@ def get_comment_context(server, toot_id, toot_url): try: res = resp.json() post_id = res['comment_view']['comment']['post_id'] - log(f"Got parent post ID {post_id} for comment {toot_url}") return get_comments_urls(server, post_id, toot_url) except Exception as ex: log(f"Error parsing context for comment {toot_url}. Exception: {ex}") @@ -674,6 +673,23 @@ def get_comment_context(server, toot_id, toot_url): def get_comments_urls(server, post_id, toot_url): """get the URLs of the comments of the given post""" + urls = [] + url = f"https://{server}/api/v3/post?id={post_id}" + try: + resp = get(url) + except Exception as ex: + log(f"Error getting post {post_id} from {toot_url}. Exception: {ex}") + return [] + + if resp.status_code == 200: + try: + res = resp.json() + if res['post_view']['counts']['comments'] == 0: + return [] + urls.append(res['post_view']['post']['ap_id']) + except Exception as ex: + log(f"Error parsing post {post_id} from {toot_url}. Exception: {ex}") + url = f"https://{server}/api/v3/comment/list?post_id={post_id}&sort=New&limit=50" try: resp = get(url) @@ -686,19 +702,17 @@ def get_comments_urls(server, post_id, toot_url): res = resp.json() list_of_urls = [comment_info['comment']['ap_id'] for comment_info in res['comments']] log(f"Got {len(list_of_urls)} comments for post {toot_url}") - return list_of_urls + urls.extend(list_of_urls) + return urls except Exception as ex: log(f"Error parsing comments for post {toot_url}. Exception: {ex}") - return [] elif resp.status_code == 429: reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ') log(f"Rate Limit hit when getting comments for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}") time.sleep((reset - datetime.now()).total_seconds() + 1) return get_comments_urls(server, post_id, toot_url) - log( - f"Error getting comments for post {toot_url}. Status code: {resp.status_code}" - ) + log(f"Error getting comments for post {toot_url}. Status code: {resp.status_code}") return [] def add_context_urls(server, access_token, context_urls, seen_urls):