From c1f0e8ac61b58d6f39cf188b4343fe8c61b73a39 Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Thu, 29 Jun 2023 00:20:12 -0400
Subject: [PATCH 1/9] feat: lemmy

---
 find_posts.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
diff --git a/find_posts.py b/find_posts.py
index e442d9c..2ad366b 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -439,6 +439,10 @@ def parse_user_url(url):
     if match is not None:
         return match
 
+    match = parse_lemmy_profile_url(url)
+    if match is not None:
+        return match
+
     log(f"Error parsing Profile URL {url}")
     
     return None
@@ -459,6 +463,11 @@ def parse_url(url, parsed_urls):
         if match is not None:
             parsed_urls[url] = match
 
+    if url not in parsed_urls:
+        match = parse_lemmy_url(url)
+        if match is not None:
+            parsed_urls[url] = match
+
     if url not in parsed_urls:
         log(f"Error parsing toot URL {url}")
         parsed_urls[url] = None
@@ -522,6 +531,25 @@ def parse_pixelfed_profile_url(url):
         return (match.group("server"), match.group("username"))
     return None
 
+def parse_lemmy_url(url):
+    """parse a Lemmy URL and return the server, and ID"""
+    match = re.match(
+        r"https://(?P<server>[^/]+)/comment/(?P<toot_id>[^/]+)", url
+    )
+    if match is None:
+        match = re.match(
+            r"https://(?P<server>[^/]+)/post/(?P<toot_id>[^/]+)", url
+        )
+    if match is not None:
+        return (match.group("server"), match.group("toot_id"))
+    return None
+
+def parse_lemmy_profile_url(url):
+    """parse a Lemmy Profile URL and return the server and username"""
+    match = re.match(r"https://(?P<server>[^/]+)/u/(?P<username>[^/]+)", url)
+    if match is not None:
+        return (match.group("server"), match.group("username"))
+    return None
 
 def get_redirect_url(url):
     """get the URL given URL redirects to"""
@@ -559,6 +587,10 @@ def get_all_context_urls(server, replied_toot_ids):
 
 def get_toot_context(server, toot_id, toot_url):
     """get the URLs of the context toots of the given toot"""
+    if toot_url.find("/comment/") != -1:
+        return get_comment_context(server, toot_id, toot_url)
+    if toot_url.find("/post/") != -1:
+        return get_comments_urls(server, toot_id, toot_url)
     url = f"https://{server}/api/v1/statuses/{toot_id}/context"
     try:
         resp = get(url)
@@ -585,6 +617,58 @@ def get_toot_context(server, toot_id, toot_url):
     )
     return []
 
+def get_comment_context(server, toot_id, toot_url):
+    """get the URLs of the context toots of the given toot"""
+    comment = f"https://{server}/api/v3/comment?id={toot_id}"
+    try:
+        resp = get(comment)
+    except Exception as ex:
+        log(f"Error getting comment {toot_id} from {toot_url}. Exception: {ex}")
+        return []
+    
+    if resp.status_code == 200:
+        try:
+            res = resp.json()
+            post_id = res['comment_view']['comment']['post_id']
+            log(f"Got parent post ID {post_id} for comment {toot_url}")
+            return get_comments_urls(server, post_id, toot_url)
+        except Exception as ex:
+            log(f"Error parsing context for comment {toot_url}. Exception: {ex}")
+        return []
+    elif resp.status_code == 429:
+        reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
+        log(f"Rate Limit hit when getting context for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
+        time.sleep((reset - datetime.now()).total_seconds() + 1)
+        return get_comment_context(server, toot_id, toot_url)
+
+def get_comments_urls(server, post_id, toot_url):
+    """get the URLs of the comments of the given post"""
+    url = f"https://{server}/api/v3/comment/list?post_id={post_id}"
+    try:
+        resp = get(url)
+    except Exception as ex:
+        log(f"Error getting comments for post {post_id} from {toot_url}. Exception: {ex}")
+        return []
+
+    if resp.status_code == 200:
+        try:
+            res = resp.json()
+            list_of_urls = [comment_info['comment']['ap_id'] for comment_info in res['comments']]
+            log(f"Got {len(list_of_urls)} comments for post {toot_url}")
+            return list_of_urls
+        except Exception as ex:
+            log(f"Error parsing comments for post {toot_url}. Exception: {ex}")
+        return []
+    elif resp.status_code == 429:
+        reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
+        log(f"Rate Limit hit when getting comments for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
+        time.sleep((reset - datetime.now()).total_seconds() + 1)
+        return get_comments_urls(server, post_id, toot_url)
+
+    log(
+        f"Error getting comments for post {toot_url}. Status code: {resp.status_code}"
+    )
+    return []
 
 def add_context_urls(server, access_token, context_urls, seen_urls):
     """add the given toot URLs to the server"""

From b7ef2be02e8633d08267705f354234d5755c046f Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Thu, 29 Jun 2023 01:57:33 -0400
Subject: [PATCH 2/9] chore: refactor get_all_known_context_urls

---
 find_posts.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 2ad366b..8d5fb3c 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -356,21 +356,20 @@ def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
     )
 
 
-def get_all_known_context_urls(server, reply_toots,parsed_urls):
+def get_all_known_context_urls(server, reply_toots, parsed_urls):
     """get the context toots of the given toots from their original server"""
-    known_context_urls = set(
-        filter(
-            lambda url: not url.startswith(f"https://{server}/"),
-            itertools.chain.from_iterable(
-                get_toot_context(*parse_url(toot["url"] if toot["reblog"] is None else toot["reblog"]["url"],parsed_urls), toot["url"])
-                for toot in filter(
-                    lambda toot: toot_has_parseable_url(toot,parsed_urls),
-                    reply_toots
-                )            
-            ),
-        )
-    )
+    known_context_urls = set()
+    
+    for toot in reply_toots:
+        if toot_has_parseable_url(toot, parsed_urls):
+            url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
+            parsed_url = parse_url(url, parsed_urls)
+            context = get_toot_context(parsed_url[0], parsed_url[1], url)
+            known_context_urls.update(context) # type: ignore
+    
+    known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
     log(f"Found {len(known_context_urls)} known context toots")
+    
     return known_context_urls
 
 
@@ -1050,4 +1049,4 @@ if __name__ == "__main__":
                 get(f"{arguments.on_fail}?rid={runId}")
             except Exception as ex:
                 log(f"Error getting callback url: {ex}")
-        raise
+        raise
\ No newline at end of file

From e290f2c05f80832c42a6c282d7d16792dab80bf8 Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 00:11:33 -0400
Subject: [PATCH 3/9] chore: use getters

---
 find_posts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 8d5fb3c..22f23db 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -74,7 +74,7 @@ def add_user_posts(server, access_token, followings, know_followings, all_known_
                 count = 0
                 failed = 0
                 for post in posts:
-                    if post['reblog'] == None and post['url'] != None and post['url'] not in seen_urls:
+                    if post.get('reblog') is None and post.get('url') is not None and post.get('url') not in seen_urls:
                         added = add_post_with_context(post, server, access_token, seen_urls)
                         if added is True:
                             seen_urls.add(post['url'])
@@ -90,7 +90,7 @@ def add_post_with_context(post, server, access_token, seen_urls):
     added = add_context_url(post['url'], server, access_token)
     if added is True:
         seen_urls.add(post['url'])
-        if (post['replies_count'] or post['in_reply_to_id']) and arguments.backfill_with_context > 0:
+        if ('replies_count' in post or 'in_reply_to_id' in post) and getattr(arguments, 'backfill_with_context', 0) > 0:
             parsed_urls = {}
             parsed = parse_url(post['url'], parsed_urls)
             if parsed == None:

From 4011883ef2f7bb757c151ac2974e2c04a3502803 Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 00:16:59 -0400
Subject: [PATCH 4/9] feat: lemmy-2

---
 find_posts.py | 41 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 22f23db..04c2255 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -113,6 +113,37 @@ def get_user_posts(user, know_followings, server):
         log(f"{user['acct']} is a local user. Skip")
         know_followings.add(user['acct'])
         return None
+    if re.match(r"^https:\/\/[^\/]+\/c\/", user['url']):
+        try:
+            url = f"https://{parsed_url[0]}/api/v3/post/list?community_name={parsed_url[1]}&sort=New&limit=50"
+            response = get(url)
+
+            if(response.status_code == 200):
+                posts = [post['post'] for post in response.json()['posts']]
+                for post in posts:
+                    post['url'] = post['ap_id']
+                return posts
+
+        except Exception as ex:
+            log(f"Error getting community posts for community {parsed_url[1]}: {ex}")
+        return None
+    
+    if re.match(r"^https:\/\/[^\/]+\/u\/", user['url']):
+        try:
+            url = f"https://{parsed_url[0]}/api/v3/user?username={parsed_url[1]}&sort=New&limit=50"
+            response = get(url)
+
+            if(response.status_code == 200):
+                comments = [post['post'] for post in response.json()['comments']]
+                posts = [post['post'] for post in response.json()['posts']]
+                all_posts = comments + posts
+                for post in all_posts:
+                    post['url'] = post['ap_id']
+                return all_posts
+            
+        except Exception as ex:
+            log(f"Error getting user posts for user {parsed_url[1]}: {ex}")
+        return None
     
     try:
         user_id = get_user_id(parsed_url[0], parsed_url[1])
@@ -533,19 +564,15 @@ def parse_pixelfed_profile_url(url):
 def parse_lemmy_url(url):
     """parse a Lemmy URL and return the server, and ID"""
     match = re.match(
-        r"https://(?P<server>[^/]+)/comment/(?P<toot_id>[^/]+)", url
+        r"https://(?P<server>[^/]+)/(?:comment|post)/(?P<toot_id>[^/]+)", url
     )
-    if match is None:
-        match = re.match(
-            r"https://(?P<server>[^/]+)/post/(?P<toot_id>[^/]+)", url
-        )
     if match is not None:
         return (match.group("server"), match.group("toot_id"))
     return None
 
 def parse_lemmy_profile_url(url):
     """parse a Lemmy Profile URL and return the server and username"""
-    match = re.match(r"https://(?P<server>[^/]+)/u/(?P<username>[^/]+)", url)
+    match = re.match(r"https://(?P<server>[^/]+)/(?:u|c)/(?P<username>[^/]+)", url)
     if match is not None:
         return (match.group("server"), match.group("username"))
     return None
@@ -642,7 +669,7 @@ def get_comment_context(server, toot_id, toot_url):
 
 def get_comments_urls(server, post_id, toot_url):
     """get the URLs of the comments of the given post"""
-    url = f"https://{server}/api/v3/comment/list?post_id={post_id}"
+    url = f"https://{server}/api/v3/comment/list?post_id={post_id}&sort=New&limit=50"
     try:
         resp = get(url)
     except Exception as ex:

From 4751d96a1d3bbdbd645ea3306343379db53baa13 Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 00:18:39 -0400
Subject: [PATCH 5/9] chore: check none type

---
 find_posts.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/find_posts.py b/find_posts.py
index 04c2255..dffe86f 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -396,7 +396,10 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls):
             url = toot["url"] if toot["reblog"] is None else toot["reblog"]["url"]
             parsed_url = parse_url(url, parsed_urls)
             context = get_toot_context(parsed_url[0], parsed_url[1], url)
-            known_context_urls.update(context) # type: ignore
+            if context is not None:
+                known_context_urls.update(context) # type: ignore
+            else:
+                log(f"Error getting context for toot {url}")
     
     known_context_urls = set(filter(lambda url: not url.startswith(f"https://{server}/"), known_context_urls))
     log(f"Found {len(known_context_urls)} known context toots")

From b04664f9d5311ca6ea6ae65b77dade671540a92d Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 00:21:17 -0400
Subject: [PATCH 6/9] chore: delimit regex with forward slash

---
 find_posts.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index dffe86f..fbf4c93 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -510,7 +510,7 @@ def parse_url(url, parsed_urls):
 def parse_mastodon_profile_url(url):
     """parse a Mastodon Profile URL and return the server and username"""
     match = re.match(
-        r"https://(?P<server>.*)/@(?P<username>.*)", url
+        r"https://(?P<server>[^/]+)/@(?P<username>[^/]+)", url
     )
     if match is not None:
         return (match.group("server"), match.group("username"))
@@ -519,7 +519,7 @@ def parse_mastodon_profile_url(url):
 def parse_mastodon_url(url):
     """parse a Mastodon URL and return the server and ID"""
     match = re.match(
-        r"https://(?P<server>.*)/@(?P<username>.*)/(?P<toot_id>.*)", url
+        r"https://(?P<server>[^/]+)/@(?P<username>[^/]+)/(?P<toot_id>[^/]+)", url
     )
     if match is not None:
         return (match.group("server"), match.group("toot_id"))
@@ -528,14 +528,14 @@ def parse_mastodon_url(url):
 
 def parse_pleroma_url(url):
     """parse a Pleroma URL and return the server and ID"""
-    match = re.match(r"https://(?P<server>.*)/objects/(?P<toot_id>.*)", url)
+    match = re.match(r"https://(?P<server>[^/]+)/objects/(?P<toot_id>[^/]+)", url)
     if match is not None:
         server = match.group("server")
         url = get_redirect_url(url)
         if url is None:
             return None
         
-        match = re.match(r"/notice/(?P<toot_id>.*)", url)
+        match = re.match(r"/notice/(?P<toot_id>[^/]+)", url)
         if match is not None:
             return (server, match.group("toot_id"))
         return None
@@ -543,7 +543,7 @@ def parse_pleroma_url(url):
 
 def parse_pleroma_profile_url(url):
     """parse a Pleroma Profile URL and return the server and username"""
-    match = re.match(r"https://(?P<server>.*)/users/(?P<username>.*)", url)
+    match = re.match(r"https://(?P<server>[^/]+)/users/(?P<username>[^/]+)", url)
     if match is not None:
         return (match.group("server"), match.group("username"))
     return None
@@ -551,7 +551,7 @@ def parse_pleroma_profile_url(url):
 def parse_pixelfed_url(url):
     """parse a Pixelfed URL and return the server and ID"""
     match = re.match(
-        r"https://(?P<server>.*)/p/(?P<username>.*)/(?P<toot_id>.*)", url
+        r"https://(?P<server>[^/]+)/p/(?P<username>[^/]+)/(?P<toot_id>[^/]+)", url
     )
     if match is not None:
         return (match.group("server"), match.group("toot_id"))
@@ -559,7 +559,7 @@ def parse_pixelfed_url(url):
 
 def parse_pixelfed_profile_url(url):
     """parse a Pixelfed Profile URL and return the server and username"""
-    match = re.match(r"https://(?P<server>.*)/(?P<username>.*)", url)
+    match = re.match(r"https://(?P<server>[^/]+)/(?P<username>[^/]+)", url)
     if match is not None:
         return (match.group("server"), match.group("username"))
     return None

From 8edfbc030cc48b092d58d2cbc1d5c1b9cc80e475 Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 01:33:19 -0400
Subject: [PATCH 7/9] chore: access context items safely

---
 find_posts.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/find_posts.py b/find_posts.py
index fbf4c93..ac0b318 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -397,7 +397,8 @@ def get_all_known_context_urls(server, reply_toots, parsed_urls):
             parsed_url = parse_url(url, parsed_urls)
             context = get_toot_context(parsed_url[0], parsed_url[1], url)
             if context is not None:
-                known_context_urls.update(context) # type: ignore
+                for item in context:
+                    known_context_urls.add(item)
             else:
                 log(f"Error getting context for toot {url}")
     

From 0472fe6e0c4b28fbbb5e237a94ce8775a7fc6f37 Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 01:34:30 -0400
Subject: [PATCH 8/9] fix: match pixelfed profile last

---
 find_posts.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index ac0b318..4dd7381 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -469,11 +469,12 @@ def parse_user_url(url):
     if match is not None:
         return match
 
-    match = parse_pixelfed_profile_url(url)
+    match = parse_lemmy_profile_url(url)
     if match is not None:
         return match
 
-    match = parse_lemmy_profile_url(url)
+    # Pixelfed profile paths do not use a subdirectory, so they must be matched last.
+    match = parse_pixelfed_profile_url(url)
     if match is not None:
         return match
 
@@ -493,12 +494,12 @@ def parse_url(url, parsed_urls):
             parsed_urls[url] = match
 
     if url not in parsed_urls:
-        match = parse_pixelfed_url(url)
+        match = parse_lemmy_url(url)
         if match is not None:
             parsed_urls[url] = match
 
     if url not in parsed_urls:
-        match = parse_lemmy_url(url)
+        match = parse_pixelfed_url(url)
         if match is not None:
             parsed_urls[url] = match
 

From 6f7392cfaa218bb8f271e80e274671124dd359dc Mon Sep 17 00:00:00 2001
From: Timothy Quilling <teqed.arken@gmail.com>
Date: Fri, 30 Jun 2023 01:36:56 -0400
Subject: [PATCH 9/9] feat: fetch root lemmy post

---
 find_posts.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/find_posts.py b/find_posts.py
index 4dd7381..2fb84d6 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -661,7 +661,6 @@ def get_comment_context(server, toot_id, toot_url):
         try:
             res = resp.json()
             post_id = res['comment_view']['comment']['post_id']
-            log(f"Got parent post ID {post_id} for comment {toot_url}")
             return get_comments_urls(server, post_id, toot_url)
         except Exception as ex:
             log(f"Error parsing context for comment {toot_url}. Exception: {ex}")
@@ -674,6 +673,23 @@ def get_comment_context(server, toot_id, toot_url):
 
 def get_comments_urls(server, post_id, toot_url):
     """get the URLs of the comments of the given post"""
+    urls = []
+    url = f"https://{server}/api/v3/post?id={post_id}"
+    try:
+        resp = get(url)
+    except Exception as ex:
+        log(f"Error getting post {post_id} from {toot_url}. Exception: {ex}")
+        return []
+
+    if resp.status_code == 200:
+        try:
+            res = resp.json()
+            if res['post_view']['counts']['comments'] == 0:
+                return []
+            urls.append(res['post_view']['post']['ap_id'])
+        except Exception as ex:
+            log(f"Error parsing post {post_id} from {toot_url}. Exception: {ex}")
+
     url = f"https://{server}/api/v3/comment/list?post_id={post_id}&sort=New&limit=50"
     try:
         resp = get(url)
@@ -686,19 +702,17 @@ def get_comments_urls(server, post_id, toot_url):
             res = resp.json()
             list_of_urls = [comment_info['comment']['ap_id'] for comment_info in res['comments']]
             log(f"Got {len(list_of_urls)} comments for post {toot_url}")
-            return list_of_urls
+            urls.extend(list_of_urls)
+            return urls
         except Exception as ex:
             log(f"Error parsing comments for post {toot_url}. Exception: {ex}")
-        return []
     elif resp.status_code == 429:
         reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
         log(f"Rate Limit hit when getting comments for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
         time.sleep((reset - datetime.now()).total_seconds() + 1)
         return get_comments_urls(server, post_id, toot_url)
 
-    log(
-        f"Error getting comments for post {toot_url}. Status code: {resp.status_code}"
-    )
+    log(f"Error getting comments for post {toot_url}. Status code: {resp.status_code}")
     return []
 
 def add_context_urls(server, access_token, context_urls, seen_urls):