From 9ea4ba05fb0622fd177921e5f2b9fc48c0d1db6a Mon Sep 17 00:00:00 2001 From: nanos Date: Fri, 31 Mar 2023 17:22:35 +0100 Subject: [PATCH] allow us to choose whether to get context when backfilling --- .github/workflows/get_context.yml | 2 +- README.md | 1 + find_posts.py | 14 ++++++++------ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/get_context.yml b/.github/workflows/get_context.yml index 8032405..a326171 100644 --- a/.github/workflows/get_context.yml +++ b/.github/workflows/get_context.yml @@ -33,7 +33,7 @@ jobs: path: artifacts - name: Get Directory structure run: ls -lR - - run: python find_posts.py --lock-hours=0 --access-token=${{ secrets.ACCESS_TOKEN }} --server=${{ vars.MASTODON_SERVER }} --reply-interval-in-hours=${{ vars.REPLY_INTERVAL_IN_HOURS || 0 }} --home-timeline-length=${{ vars.HOME_TIMELINE_LENGTH || 0 }} --max-followings=${{ vars.MAX_FOLLOWINGS || 0 }} --user=${{ vars.USER }} --max-followers=${{ vars.MAX_FOLLOWERS || 0 }} --http-timeout=${{ vars.HTTP_TIMEOUT || 5 }} --max-follow-requests=${{ vars.MAX_FOLLOW_REQUESTS || 0 }} --on-fail=${{ vars.ON_FAIL }} --on-start=${{ vars.ON_START }} --on-done=${{ vars.ON_DONE }} --max-bookmarks=${{ vars.MAX_BOOKMARKS || 0 }} --remember-users-for-hours=${{ vars.REMEMBER_USERS_FOR_HOURS || 168 }} --from-notifications=${{ vars.FROM_NOTIFICATIONS || 0 }} + - run: python find_posts.py --lock-hours=0 --access-token=${{ secrets.ACCESS_TOKEN }} --server=${{ vars.MASTODON_SERVER }} --reply-interval-in-hours=${{ vars.REPLY_INTERVAL_IN_HOURS || 0 }} --home-timeline-length=${{ vars.HOME_TIMELINE_LENGTH || 0 }} --max-followings=${{ vars.MAX_FOLLOWINGS || 0 }} --user=${{ vars.USER }} --max-followers=${{ vars.MAX_FOLLOWERS || 0 }} --http-timeout=${{ vars.HTTP_TIMEOUT || 5 }} --max-follow-requests=${{ vars.MAX_FOLLOW_REQUESTS || 0 }} --on-fail=${{ vars.ON_FAIL }} --on-start=${{ vars.ON_START }} --on-done=${{ vars.ON_DONE }} --max-bookmarks=${{ vars.MAX_BOOKMARKS || 0 }} 
--remember-users-for-hours=${{ vars.REMEMBER_USERS_FOR_HOURS || 168 }} --from-notifications=${{ vars.FROM_NOTIFICATIONS || 0 }} --backfill-with-context=${{ vars.BACKFILL_WITH_CONTEXT || 0 }} - name: Upload artifacts uses: actions/upload-artifact@v3 with: diff --git a/README.md b/README.md index 66f510c..392a2a6 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ Please find the list of all configuration options, including descriptions, below | `MAX_FOLLOWERS` | `--max-followers` | No | Provide to backfill profiles for your most recent followers. Determines how many of your last followers you want to backfill. Recommended value: `80`. | `MAX_FOLLOW_REQUESTS` | `--max-follow-requests` | No | Provide to backfill profiles for the API key owner's most recent pending follow requests. Determines how many of your last follow requests you want to backfill. Recommended value: `80`. | `FROM_NOTIFICATIONS` | `--from-notifications` | No | Provide to backfill profiles of anyone mentioned in your recent notifications. Determines how many hours of notifications you want to look at. Requires an access token with `read:notifications` scope. Recommended value: `1`, unless you run FediFetcher less than once per hour. +| `BACKFILL_WITH_CONTEXT` | `--backfill-with-context` | No | Set to `1` to also fetch the context of each post that is added when backfilling. | `REMEMBER_USERS_FOR_HOURS` | `--remember-users-for-hours` | No | How long between back-filling attempts for non-followed accounts? Defaults to `168`, i.e. one week. | `HTTP_TIMEOUT` | `--http-timeout` | No | The timeout for any HTTP requests to the Mastodon API in seconds. Defaults to `5`. | -- | `--lock-hours` | No | Determines after how many hours a lock file should be discarded. Not relevant when running the script as GitHub Action, as concurrency is prevented using a different mechanism. Recommended value: `24`. 
diff --git a/find_posts.py b/find_posts.py
index d019b3f..808741c 100644
--- a/find_posts.py
+++ b/find_posts.py
@@ -26,6 +26,7 @@ argparser.add_argument('--max-bookmarks', required = False, type=int, default=0,
 argparser.add_argument('--from-notifications', required = False, type=int, default=0, help="Backfill accounts of anyone appearing in your notifications, during the last hours")
 argparser.add_argument('--remember-users-for-hours', required=False, type=int, default=24*7, help="How long to remember users that you aren't following for, before trying to backfill them again.")
 argparser.add_argument('--http-timeout', required = False, type=int, default=5, help="The timeout for any HTTP requests to your own, or other instances.")
+argparser.add_argument('--backfill-with-context', required = False, type=int, default=0, help="Backfill with context?")  # int (0/1) rather than bool: argparse calls bool() on the raw string, so "--backfill-with-context=0" would parse as True
 argparser.add_argument('--lock-hours', required = False, type=int, default=24, help="The lock timeout in hours.")
 argparser.add_argument('--on-done', required = False, default=None, help="Provide a url that will be pinged when processing has completed. You can use this for 'dead man switch' monitoring of your task")
 argparser.add_argument('--on-start', required = False, default=None, help="Provide a url that will be pinged when processing is starting. You can use this for 'dead man switch' monitoring of your task")
@@ -162,12 +163,13 @@ def add_post_with_context(post, server, access_token, seen_urls):
     added = add_context_url(post['url'], server, access_token)
     if added is True:
         seen_urls.add(post['url'])
-        parsed_urls = {}
-        parsed = parse_url(post['url'], parsed_urls)
-        if parsed == None:
-            return True
-        known_context_urls = get_all_known_context_urls(server, [post],parsed_urls)
-        add_context_urls(server, access_token, known_context_urls, seen_urls)
+        if arguments.backfill_with_context:
+            parsed_urls = {}
+            parsed = parse_url(post['url'], parsed_urls)
+            if parsed is None:
+                return True
+            known_context_urls = get_all_known_context_urls(server, [post],parsed_urls)
+            add_context_urls(server, access_token, known_context_urls, seen_urls)
         return True

     return False