Compare commits


167 commits
v2.0.0...main

Author SHA1 Message Date
Michael
205b0731db
Merge pull request #103 from likeazir/patch-1
Add sharkey as misskey-api-capable
2024-03-04 07:14:45 +00:00
Jonas
bf0ed943ec
add sharkey as misskey-api-capable 2024-03-02 09:56:41 +00:00
Michael
f5c1033fc9
Merge pull request #102 from benyafai/main
🐳 docker-compose
2024-02-28 17:07:35 +00:00
Ben Yafai
ca302bb8db 🐳 docker-compose 2024-02-28 17:05:09 +00:00
Michael
34d07a4fa1
Merge pull request #95 from nanos/update-for-node-20
Update build-container.yaml
2024-02-02 07:25:44 +00:00
Michael
e86863a8ae
Update build-container.yaml 2024-02-02 07:25:08 +00:00
Michael
e4fca0d67e
Merge pull request #94 from nanos/node-16-depracation
Update some actions to use Node 20, now that Node 16 is deprecated
2024-02-02 07:05:45 +00:00
Michael
fe1c69f3ba
Update get_context.yml
Update upload-artifact
2024-01-30 21:10:29 +00:00
Michael
0416cc159a
Update get_context.yml
update correct line
2024-01-26 16:32:54 +00:00
Michael
52d3b8d9e9
Update get_context.yml
Update 2nd ceckout too
2024-01-26 16:31:22 +00:00
Michael
3d8ab95f11
Update get_context.yml
Update action for Node 16 deprecation (#92)
2024-01-26 15:52:45 +00:00
Michael
a8dc809787
Merge pull request #90 from himynameisjonas/patch-1
Build docker image for arm64 as well
2023-12-18 10:19:11 +00:00
Jonas Brusman
099ef7d37a
Build docker image for arm64 as well
Makes it possible to run it on a RaspberryPi
2023-12-16 16:07:36 +01:00
Michael
f69eaed5a6
Merge pull request #88 from zotanmew/main
Add support for Iceshrimp
2023-10-24 13:23:15 +01:00
Laura Hausmann
7be5dfb9b1
Add support for Iceshrimp 2023-10-21 23:41:05 +02:00
nanos
95b644d431 Define nodeLoc (fixes #82) 2023-09-07 08:43:10 +01:00
Michael
bed11e83f1
Merge pull request #80 from lhoBas/fix/k8s-cronjob
examples/k8s-cronjob.yaml: fix job naming
2023-08-31 08:33:12 +01:00
Bas
dafaf93d50
examples/k8s-cronjob.yaml: fix job naming
Fixes validation errors upon applying the k8s manifest:

```
The CronJob "FediFetcher" is invalid:
* metadata.name: Invalid value: "FediFetcher": a lowercase RFC 1123 subdomain must consist of lower case alphanumeric characters, '-' or '.', and must start and end with an alphanumeric character (e.g. 'example.com', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*')
* spec.jobTemplate.spec.template.spec.containers[0].name: Invalid value: "FediFetcher": a lowercase RFC 1123 label must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name',  or '123-abc', regex used for validation is '[a-z0-9]([-a-z0-9]*[a-z0-9])?')
```
2023-08-18 10:49:22 +02:00
nanos
31f475dcdd fixes #79 2023-08-18 08:00:09 +01:00
Michael
a76b52642d
Merge pull request #71 from ToadKing/retry-cleanup
remove redundant code for retrying on HTTP 429
2023-08-14 15:49:14 +01:00
Michael
0744caad6f
Merge pull request #75 from YoannMa/fixLog
fix bug when failing to get user's posts
2023-08-06 22:42:45 +01:00
Yoann MALLEMANCHE
adc0d4ec4e
fix bug when failing to get user's posts 2023-08-06 17:54:34 +02:00
nanos
253c7c4f2b Revert "print current version on startup" (#70)
This reverts commit 213ef57abe.
2023-08-06 09:45:33 +01:00
Toad King
db2dcce2ff remove redundant code for retrying on HTTP 429 2023-08-05 11:44:48 -05:00
nanos
712d88cf0d Update list of supported servers 2023-08-05 09:29:49 +01:00
Michael
ffa6617fff
Merge pull request #66 from ToadKing/misskey
add support for misskey and calckey/firefish
2023-08-05 09:04:08 +01:00
nanos
e207bb6435 document remember-hosts-for-days 2023-08-03 15:23:23 +01:00
nanos
c90a7e42ab fix incorrect rate limit handling for mastodon 2023-08-03 15:21:59 +01:00
nanos
3294a44f76 cache host failures for a shorter time 2023-08-03 15:21:12 +01:00
nanos
174448a2b0 fix bug in writing file to disk 2023-08-03 10:54:14 +01:00
nanos
ae55c96506 improve caching to provide timeout 2023-08-03 10:53:17 +01:00
nanos
5a2b2c2311 Save seen_hosts on disk 2023-08-03 08:01:42 +01:00
nanos
179bb65253 move note regarding access token further up 2023-08-01 16:13:06 +01:00
Toad King
a7611c6e6f remove GoToSocial from Mastodon API support list
https://github.com/superseriousbusiness/gotosocial/issues/2038
2023-07-31 17:10:29 -05:00
Toad King
80ee1387f7 hook up host software lookup to user/post fetching
add depth fetching for misskey posts (works in firefish and foundkey)
2023-07-30 03:19:47 -05:00
Toad King
c92d4e1c2c start on server software detection 2023-07-26 01:14:06 -05:00
Toad King
c169b2ae30 add hacky support for misskey and calckey/firefish 2023-07-25 01:02:41 -05:00
nanos
213ef57abe print current version on startup (#58) 2023-07-03 08:46:20 +01:00
Michael
4dc41ee02c
Merge pull request #61 from Teqed/fix/crash-on-zero-notifications
fix: handle zero notifications
2023-07-03 07:02:16 +01:00
Timothy Quilling
47e8b485a5 fix: handle zero notifications 2023-07-02 23:54:29 -04:00
nanos
93d5b503af Update to indicate Lemmy support 2023-07-01 20:47:25 +01:00
Michael
f7d015004e
Merge pull request #56 from Teqed/feat/lemmy
feat: lemmy
2023-07-01 20:43:58 +01:00
Timothy Quilling
6f7392cfaa feat: fetch root lemmy post 2023-06-30 01:36:56 -04:00
Timothy Quilling
0472fe6e0c fix: match pixelfed profile last 2023-06-30 01:34:30 -04:00
Timothy Quilling
8edfbc030c chore: access context items safely 2023-06-30 01:33:19 -04:00
Timothy Quilling
d212e7a8a3
feat: lemmy communities and users 2023-06-30 00:26:52 -04:00
Timothy Quilling
b04664f9d5 chore: deliminate regex with forward slash 2023-06-30 00:21:17 -04:00
Timothy Quilling
4751d96a1d chore: check none type 2023-06-30 00:18:39 -04:00
Timothy Quilling
4011883ef2 feat: lemmy-2 2023-06-30 00:16:59 -04:00
Timothy Quilling
e290f2c05f chore: use getters 2023-06-30 00:11:33 -04:00
Timothy Quilling
b7ef2be02e chore: refactor get_all_known_context_urls 2023-06-29 01:57:33 -04:00
Timothy Quilling
c1f0e8ac61 feat: lemmy 2023-06-29 00:20:12 -04:00
nanos
535bf1f404 update readme 2023-06-15 11:17:25 +01:00
nanos
12fbd0ed72 check out own fork when running GH action (fixes #52) 2023-06-15 11:12:18 +01:00
nanos
9fec312b38 documentation updates 2023-06-15 09:11:45 +01:00
nanos
9edbee7285 Deal better with it, when someone provides a URL instead of a server name (#50, #45) 2023-06-15 08:00:05 +01:00
Michael
3620b4944b
Merge pull request #51 from nanos/improve-config
Improve config
2023-06-15 07:25:45 +01:00
nanos
8168aa8036 minor fixes 2023-06-14 10:49:15 +01:00
nanos
bf1b73bc04 Improve and unify configuration options (#49) 2023-06-14 10:41:30 +01:00
nanos
46a5be98df Fix bug in providing access token as config where it isn't being recognised as array 2023-06-14 09:51:56 +01:00
nanos
5a3db443cb Allow us to pass configuration options in a json file
Addresses #48
2023-06-09 21:55:09 +01:00
nanos
d4dfa1e315 Wrap URLs in quotes. This fixes #46 2023-06-01 21:35:16 +01:00
Michael
c7e0555394
Update README 2023-04-28 20:57:32 +01:00
Michael
5f6ef2646a
Merge pull request #43 from nanos/dev
support for fetching context to favourites (#42)
2023-04-28 20:49:45 +01:00
nanos
1fffddcb23 support for fetching context to favourites (#42) 2023-04-28 20:40:24 +01:00
nanos
311353348c readme updates 2023-04-21 10:48:39 +01:00
nanos
080c5dfb78 add link 2023-04-11 09:17:59 +01:00
nanos
3ae3be9184 Add a note on supported servers 2023-04-10 16:28:22 +01:00
nanos
8235cda859 support for multiple access tokens 2023-04-10 16:28:09 +01:00
nanos
d0cb212315 fix silly comment 🤦‍♂️ 2023-04-06 09:13:30 +01:00
nanos
bfbffc0773 Fix backfill options [#37] 2023-04-06 08:00:28 +01:00
nanos
b9d82dc073 Fixes missing arguments (#35) 2023-04-05 09:15:24 +01:00
Michael
886a0cecd2
Merge pull request #29 from arachnist/configurable-state-location
make the state location configurable
2023-04-05 07:49:35 +01:00
nanos
12ba458563 lowercase container name (#34) 2023-04-05 07:35:05 +01:00
nanos
a3f2b23022 dont attempt to backfill reblogs, as we cannot 2023-04-04 17:00:35 +01:00
nanos
9f28ba2333 bug fixes 2023-04-03 08:57:31 +01:00
nanos
2b707e7807 some refactoring 2023-04-03 08:36:09 +01:00
Robert Gerus
73a296c310 make the state file configurable 2023-04-01 16:03:43 +02:00
nanos
56039cfdea only request context to a toot, if we know context exists 2023-04-01 08:44:12 +01:00
nanos
f51d19730b put a cap on mentions to backfill during any given executions 2023-03-31 17:23:12 +01:00
nanos
9ea4ba05fb allow us to choose whether to get context when backfilling 2023-03-31 17:22:35 +01:00
nanos
785499ab82 do make sure we also backfill users from reblogs 2023-03-31 10:23:47 +01:00
nanos
a357470328 also backfill post authors 2023-03-31 09:05:51 +01:00
nanos
5f438ee873 backfill any user mentioned in the home timeline 2023-03-31 09:05:43 +01:00
nanos
dc0d94a274 Add Context when adding posts 2023-03-31 09:05:29 +01:00
nanos
0c87cd6727 Allow us to parse Pixelfed URLs
fixes #30
2023-03-31 07:51:18 +01:00
nanos
44f15de367 readme updates 2023-03-28 09:38:48 +01:00
nanos
871cfdeab6 Added a note re .pyw and Windows as suggested in #27 2023-03-28 09:34:56 +01:00
nanos
e40a5447ee Add a sample bash script 2023-03-28 07:04:31 +01:00
nanos
9b7093e478 Allow us to backfill posts from users that are mentioned in notifications 2023-03-28 07:04:03 +01:00
Michael
c5208568b5
Update k8s-cronjob.yaml
Must be lower case
2023-03-27 09:51:41 +01:00
nanos
87fd32eb9d bug fix 2023-03-27 07:25:01 +01:00
nanos
1e7aafa6b2 Merge branch 'shiruken-main'
# Conflicts:
#	.github/workflows/get_context.yml
2023-03-27 07:15:54 +01:00
nanos
a8cf5d3eef Rename to FediFetcher 2023-03-27 07:15:08 +01:00
nanos
ee045ab493 make sure we follow redirects 2023-03-27 07:15:08 +01:00
nanos
3b9eecce08 Rename to FediFetcher 2023-03-27 07:02:59 +01:00
nanos
b129c9445c make sure we follow redirects 2023-03-23 19:59:08 +00:00
Colin Sullender
58c064db0f
Checkout user repository for keep alive 2023-03-23 14:26:48 -04:00
Colin Sullender
0c73962fb4
Simplify checking out latest release 2023-03-23 09:11:27 -04:00
Michael
b1a382c55a
Merge pull request #14 from nikdoof/main
Update Kubernetes example and README for persistence
2023-03-23 07:13:50 +00:00
Michael
bbc2791e7c
Merge pull request #22 from shiruken/main
Create artifacts directory in docker image
2023-03-23 07:12:22 +00:00
Michael
7eb708d4e5
Merge branch 'main' into main 2023-03-23 07:11:52 +00:00
nanos
6953672df4 use curl's retry mechanism when getting the release
addresses #25
2023-03-23 07:09:30 +00:00
nanos
e2b4addcfc Try again to keep workflow alive
Fixes #24
2023-03-23 07:06:14 +00:00
nanos
3491c6a7e2 don't attempt to send beacon if urls are empty string 2023-03-22 11:40:10 +00:00
Michael
0d95631980
Disable keepalive
This is causing an error (#23) because we aren't actually checking out the repo.

Need to revisit at a later stage.
2023-03-22 07:24:47 +00:00
Colin Sullender
ddabff2d30
Disable keepalive-workflow 2023-03-21 19:46:17 -04:00
Colin Sullender
65807e7842
Create artifacts directory in docker image 2023-03-21 19:26:01 -04:00
nanos
afc981930c Implement option to get replies to bookmarks (#19) 2023-03-21 19:23:14 +00:00
nanos
c587e0907a finalise callbacks 2023-03-21 14:42:31 +00:00
nanos
38231cc4f6 fix how we deal with failure to get timeline toots 2023-03-21 14:22:19 +00:00
nanos
79b69e7fdc also add a callback for process start 2023-03-21 11:56:34 +00:00
nanos
586888c761 implement callback upon completion
This fixes #20
2023-03-21 10:27:43 +00:00
nanos
93e234736c edit 2023-03-21 08:47:44 +00:00
nanos
1d366b0f86 Prevent workflow from getting disabled (#17) 2023-03-21 08:43:57 +00:00
nanos
200d3a155d forgot to remove lock file 2023-03-21 08:40:22 +00:00
nanos
12bde97687 allow us to set lock-hours to 0 to prevent the action getting stuck (#18) 2023-03-21 08:28:29 +00:00
nanos
917ff8354d Document locking for #18 2023-03-21 08:26:48 +00:00
nanos
f8979ed578 fix default max-follow-requests 2023-03-21 08:18:23 +00:00
nanos
43c04cf05f Improve handling of lock (see #18) 2023-03-21 08:17:03 +00:00
nanos
27bfcc3841 Improved output 2023-03-21 07:55:52 +00:00
nanos
62989f4041 Use file based locking when running script locally
Fixes #18
2023-03-20 21:15:21 +00:00
nanos
5be011b8d9 Fix bug in getting follow requests 2023-03-20 21:15:21 +00:00
nanos
f9600844f8
Update README.md 2023-03-20 18:04:40 +00:00
nanos
3c500d6080
Update README.md 2023-03-20 16:19:43 +00:00
nanos
76f7c22d73 backfill follow requests' profiles (addresses #7) 2023-03-20 09:12:16 +00:00
nanos
252e648ec8
Merge pull request #15 from nikdoof/rate_limit_fix
Fix var naming conflict with argparse and date parser
2023-03-19 20:56:05 +00:00
Andrew Williams
429a16667c
Fix var naming conflict with argparse and date parser 2023-03-19 13:37:55 +00:00
Andrew Williams
5e7fbba0d0
Add more arguments to the example 2023-03-19 13:19:53 +00:00
Andrew Williams
f8c9cf7281
Update Kubernetes example and README for persistence 2023-03-19 13:07:31 +00:00
nanos
fc7d2d0bbc typofix 2023-03-17 11:55:11 +00:00
Michael Thomas
cf8b98ae78 silly bugfix 2023-03-17 08:42:20 +00:00
Michael Thomas
a624f1ae58 Allow us to make the HTTP timeout configurable 2023-03-17 08:26:38 +00:00
Michael Thomas
9a5649e0b3 improve configuration options 2023-03-17 08:14:47 +00:00
nanos
a7fc312b6c
Update build-container.yaml 2023-03-17 07:46:49 +00:00
nanos
27415bdff6
Merge pull request #13 from nikdoof/main
Add Dockerfile and container build workflow
2023-03-17 07:15:55 +00:00
Andrew Williams
1676885ed5
Update README and add example CronJob definition 2023-03-17 06:54:16 +00:00
Andrew Williams
2f937d6ce9
Add Dockerfile and container build workflow 2023-03-16 16:23:08 +00:00
nanos
5fa7fe2b46
Merge pull request #10 from cassidyjames/patch-1
getAllRepliesToKnownPots → getAllRepliesToKnownPosts
2023-03-16 08:59:46 +00:00
Michael Thomas
281ac19c19 change action 2023-03-16 08:54:26 +00:00
Michael Thomas
837043bbe0 edits for readability 2023-03-16 08:50:28 +00:00
Michael Thomas
c31ce64ab7 update readme 2023-03-16 08:48:20 +00:00
Michael Thomas
90e6ac39b2 use the new find_posts, with get_context being used as fallback 2023-03-16 08:25:24 +00:00
Michael Thomas
982d3b3325 use named arguments 2023-03-16 08:02:07 +00:00
Cassidy James Blaede
d0329ecbe5
getAllRepliesToKnownPots → getAllRepliesToKnownPosts
I figure this was a typo?
2023-03-15 22:17:34 -06:00
Michael Thomas
7165154570 Fix for missing mentions
Fixes #9
2023-03-15 16:42:42 +00:00
Michael Thomas
ffc80a5081 Be timezone aware in processing ratelimits 2023-03-15 07:20:49 +00:00
Michael Thomas
ce69c2a033 improve output for debugging 2023-03-14 21:13:48 +00:00
Michael Thomas
248542d7a6 add the stuff required to pull in followers, 2023-03-14 10:23:05 +00:00
Michael Thomas
ddfa7cf639 Allow us to get recent followers' posts as well 2023-03-14 10:19:25 +00:00
nanos
328238bcc7
Update README.md 2023-03-14 07:59:45 +00:00
nanos
d5949acb98
Update README.md 2023-03-14 07:49:30 +00:00
nanos
263751a1b9
Update README.md
Add a note on how to run this script locally
2023-03-14 07:31:21 +00:00
nanos
b0af586bd0
Update README.md 2023-03-13 13:51:57 +00:00
nanos
0e97751d54
Update README.md 2023-03-13 13:51:45 +00:00
Michael Thomas
3079fb088e better edits 2023-03-13 13:24:19 +00:00
Michael Thomas
174a6731a3 make ul 2023-03-13 13:22:55 +00:00
Michael Thomas
13b13fd916 udpate readme 2023-03-13 13:22:20 +00:00
Michael Thomas
022a01f3e8 fix formatting 2023-03-13 12:37:24 +00:00
Michael Thomas
a628fcfab3 update readme and action 2023-03-13 12:36:00 +00:00
Michael Thomas
7b265fa6a1 Use a proper wrapper around all the mastodon requests, to ensure they all honour the rate limits, and pass on the correct UA 2023-03-13 12:24:46 +00:00
Michael Thomas
974c7500ff Bit of tidying up 2023-03-13 12:22:15 +00:00
Michael Thomas
382b06abbb Add option to backfill recent followings' posts 2023-03-13 11:00:24 +00:00
Michael Thomas
23b7275907 better link 2023-03-11 08:50:36 +00:00
nanos
3873edf4f6
Update README.md 2023-03-11 07:58:53 +00:00
Michael Thomas
5ed5637173 Provide a User Agent when querying the Mastodon API 2023-03-10 12:30:28 +00:00
Michael Thomas
8f5fff289d fix the bug for seen_urls 2023-03-09 10:14:12 +00:00
12 changed files with 1838 additions and 532 deletions

.github/workflows/build-container.yaml (vendored, new file, 31 lines)

@@ -0,0 +1,31 @@
name: Build Container
"on":
push:
tags:
- "v[0-9]+.[0-9]+.[0-9]+"
jobs:
docker:
runs-on: ubuntu-latest
steps:
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to GHCR
uses: docker/login-action@v3
if: github.event_name != 'pull_request'
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push
id: docker_build
uses: docker/build-push-action@v5
with:
push: true
platforms: linux/amd64,linux/arm64
tags: |
ghcr.io/${{ github.repository_owner }}/fedifetcher:${{ github.ref_name }}
ghcr.io/${{ github.repository_owner }}/fedifetcher:latest

.github/workflows/get_context.yml

@@ -1,4 +1,4 @@
-name: getAllRepliesToKnownPots
+name: getAllRepliesToKnownPosts
 concurrency: get_context
 on:
@@ -11,20 +11,18 @@ jobs:
     runs-on: ubuntu-latest
     environment: mastodon
     steps:
-      - name: Get latest release
-        run: |
-          curl -s https://api.github.com/repos/nanos/mastodon_get_replies/releases/latest | jq .zipball_url | xargs wget -O download.zip
-          unzip -j download.zip
-          mkdir artifacts
-          ls -lR
+      - name: Checkout original repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
          cache: 'pip' # caching pip dependencies
      - run: pip install -r requirements.txt
      - name: Download all workflow run artifacts
-        uses: dawidd6/action-download-artifact@v2
+        uses: dawidd6/action-download-artifact@v3
        with:
          name: artifacts
          workflow: get_context.yml
@@ -32,10 +30,14 @@ jobs:
          path: artifacts
      - name: Get Directory structure
        run: ls -lR
-      - run: python get_context.py ${{ secrets.ACCESS_TOKEN }} ${{ vars.MASTODON_SERVER }} ${{ vars.REPLY_INTERVAL_IN_HOURS }} ${{ vars.HOME_TIMELINE_LENGTH }}
+      - run: python find_posts.py --lock-hours=0 --access-token=${{ secrets.ACCESS_TOKEN }} -c="./config.json"
      - name: Upload artifacts
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
          name: artifacts
          path: |
            artifacts
+      - name: Checkout user's forked repository for keeping workflow alive
+        uses: actions/checkout@v4
+      - name: Keep workflow alive
+        uses: gautamkrishnar/keepalive-workflow@v1

.gitignore (vendored, 3 lines changed)

@@ -1,3 +1,2 @@
 .vscode/launch.json
-artifacts/replied_toot_server_ids
-artifacts/seen_urls
+artifacts/*

Dockerfile (new file, 7 lines)

@@ -0,0 +1,7 @@
FROM python:3.11-alpine
WORKDIR /app
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
RUN mkdir -p /app/artifacts/
COPY ./find_posts.py /app/
ENTRYPOINT ["python", "find_posts.py"]

README.md (172 lines changed)

@@ -1,43 +1,175 @@
# Pull missing responses into Mastodon
# FediFetcher for Mastodon
This GitHub repository provides a GitHub action that runs every 10 mins, and pulls remote replies into your instance, using the Mastodon API. It has two parts:
This GitHub repository provides a simple script that can pull missing posts into Mastodon using the Mastodon API. FediFetcher has no further dependencies, and can be run as a GitHub Action, as a scheduled cron job, or as a pre-packaged container. Here is what FediFetcher can do:
1. It gets remote replies to posts that users on your instance have already replied to during the last `REPLY_INTERVAL_IN_HOURS` hours, and adds them to your own server.
2. It gets remote replies to the last `HOME_TIMELINE_LENGTH` posts from your home timeline, and adds them to your own server.
1. It can pull missing remote replies to posts that are already on your server into your server. Specifically, it can
   1. fetch missing replies to posts that users on your instance have already replied to,
   2. fetch missing replies to the most recent posts in your home timeline,
   3. fetch missing replies to your bookmarks,
   4. fetch missing replies to your favourites.
2. It can also backfill profiles on your instance. In particular it can
   1. fetch missing posts from users that have recently appeared in your notifications,
   2. fetch missing posts from users that you have recently followed,
   3. fetch missing posts from users that have recently followed you,
   4. fetch missing posts from users that have recently sent you a follow request.
Either part can be disabled completely, and the values for `REPLY_INTERVAL_IN_HOURS` and `HOME_TIMELINE_LENGTH` are configurable (see below).
Each part of this script is fully configurable, and you can completely disable parts that you are not interested in.
**Be aware, that this script may run for a long time, if these values are too high.** Experiment a bit with what works for you, by starting with fairly small numbers (maybe `HOME_TIMELINE_LENGTH = 50`, `REPLY_INTERVAL_IN_HOURS = 12`) and increase the numbers as you see fit.
FediFetcher will store posts and profiles it has already pulled in on disk, to prevent re-fetching the same info in subsequent executions.
**For full context and discussion on why this is needed, read the blog post: [Pull missing responses into Mastodon](https://blog.thms.uk/2023/03/pull-missing-responses-into-mastodon?utm_source=github)**
**Be aware that this script may run for a *very* long time.** This is particularly true the first time this script runs, and/or if you enable all parts of this script. You should ensure that you take steps to prevent multiple overlapping executions of this script, as that will lead to unpleasant results. There are detailed instructions for this below.
For detailed information on the how and why, please read the [FediFetcher for Mastodon page](https://blog.thms.uk/fedifetcher?utm_source=github).
## Supported servers
FediFetcher makes use of the Mastodon API. It'll run against any instance implementing this API, and whilst it was built for Mastodon, it's been [confirmed working against Pleroma](https://fed.xnor.in/objects/6bd47928-704a-4cb8-82d6-87471d1b632f) as well.
FediFetcher will pull in posts and profiles from any servers running the following software: Mastodon, Pleroma, Akkoma, Pixelfed, Hometown, Misskey, Firefish (Calckey), Foundkey, and Lemmy.
## Setup
You can run FediFetcher either as a GitHub Action, as a scheduled cron job on your local machine/server, or from a pre-packaged container.
### 1) Get the required access token:
Regardless of how you want to run FediFetcher, you must first get an access token:
#### If you are an Admin on your instance
1. In Mastodon go to Preferences > Development > New Application
1. give it a nice name
2. enable `read:search`, `read:statuses` and `admin:read:accounts`
1. Give it a nice name
2. Enable the required scopes for your options. You could tick `read` and `admin:read:accounts`, or see below for a list of which scopes are required for which options.
3. Save
4. Copy the value of `Your access token`
### 2) Configure and run the GitHub action
#### If you are not an Admin on your Instance
1. Go to [GetAuth for Mastodon](https://getauth.thms.uk?scopes=read&client_name=FediFetcher)
2. Type in your Mastodon instance's domain
3. Copy the token.
### 2) Configure and run FediFetcher
Run FediFetcher as a GitHub Action, a cron job, or a container:
#### To run FediFetcher as a GitHub Action:
1. Fork this repository
2. Add your access token:
1. Go to Settings > Secrets and Variables > Actions
2. Click New Repository Secret
3. Supply the Name `ACCESS_TOKEN` and provide the Token generated above as Secret
3. Provide the required environment variables, to configure your Action:
1. Go to Settings > Environments
2. Click New Environment
3. Provide the name `Mastodon`
4. Add the following Environment Variables:
- `MASTODON_SERVER`: The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`
- `HOME_TIMELINE_LENGTH`: An integer number. E.g. `200`. (See above for explanation.) Set to `0` to disable this part of the script.
- `REPLY_INTERVAL_IN_HOURS`: An integer number. E.g. `24`. (See above for explanation). Set to `0` to disable this part of the script.
4. Finally go to the Actions tab and enable the action.
3. Create a file called `config.json` with your [configuration options](#configuration-options) in the repository root. **Do NOT include the Access Token in your `config.json`!**
4. Finally go to the Actions tab and enable the action. The action should now automatically run approximately once every 10 min.
> **Note**
>
> Keep in mind that [the schedule event can be delayed during periods of high loads of GitHub Actions workflow runs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule).
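For step 3 above, a minimal `config.json` for the GitHub Action might look like the sketch below. The values are illustrative, and the access token is deliberately absent, because for the GitHub Action it must be supplied as the `ACCESS_TOKEN` secret instead:

```json
{
  "server": "your.mastodon.server",
  "home-timeline-length": 200,
  "from-notifications": 1
}
```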
#### To run FediFetcher as a cron job:
1. Clone this repository.
2. Install requirements: `pip install -r requirements.txt`
3. Create a `json` file with [your configuration options](#configuration-options). You may wish to store this in the `./artifacts` directory, as that directory is `.gitignore`d
4. Then simply run this script like so: `python find_posts.py -c=./artifacts/config.json`.
If desired, all configuration options can be provided as command line flags, instead of through a JSON file. An [example script](./examples/FediFetcher.sh) can be found in the `examples` folder.
When running FediFetcher as a cron job, file-based locking is used to avoid multiple overlapping executions of the script. The timeout period for the lock can be configured using `lock-hours`.
> **Note**
>
> If you are running FediFetcher locally, my recommendation is to run it manually once, before turning on the cron job: The first run will be significantly slower than subsequent runs, and that will help you prevent overlapping during that first run.
#### To run FediFetcher from a container:
FediFetcher is also available in a pre-packaged container, [FediFetcher](https://github.com/nanos/FediFetcher/pkgs/container/fedifetcher) - Thank you [@nikdoof](https://github.com/nikdoof).
1. Pull the container from `ghcr.io`, using Docker or your container tool of choice: `docker pull ghcr.io/nanos/fedifetcher:latest`
2. Run the container, passing the configurations options as command line arguments: `docker run -it ghcr.io/nanos/fedifetcher:latest --access-token=<TOKEN> --server=<SERVER>`
> **Note**
>
> The same rules for running this as a cron job apply to running the container: don't overlap any executions.
Persistent files are stored in `/app/artifacts` within the container, so you may want to map this to a local folder on your system.
An [example Kubernetes CronJob](./examples/k8s-cronjob.yaml) for running the container is included in the `examples` folder.
An [example Docker Compose Script](./examples/docker-compose.yaml) for running the container periodically is included in the `examples` folder.
### Configuration options
FediFetcher has quite a few configuration options, so here is my quick configuration advice that should work for most people:
> **Warning**
>
> **Do NOT** include your `access-token` in the `config.json` when running FediFetcher as GitHub Action. When running FediFetcher as GitHub Action **ALWAYS** [set the Access Token as an Action Secret](#to-run-fedifetcher-as-a-github-action).
```json
{
"access-token": "Your access token",
"server": "your.mastodon.server",
"home-timeline-length": 200,
"max-followings": 80,
"from-notifications": 1
}
```
If you configure FediFetcher this way, it'll fetch missing remote replies to the last 200 posts in your home timeline. It'll additionally backfill profiles of the last 80 people you followed, and of every account who appeared in your notifications during the past hour.
#### Advanced Options
Please find the list of all configuration options, including descriptions, below:
Option | Required? | Notes |
|:----------------------------------------------------|-----------|:------|
|`access-token` | Yes | The access token. If using GitHub action, this needs to be provided as a Secret called `ACCESS_TOKEN`. If running as a cron job or a container, you can supply this option as an array, to [fetch posts for multiple users](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher) on your instance. |
|`server`|Yes|The domain only of your mastodon server (without `https://` prefix) e.g. `mstdn.thms.uk`. |
|`home-timeline-length` | No | Provide to fetch remote replies to posts in the API-Key owner's home timeline. Determines how many posts we'll fetch replies for. Recommended value: `200`.
| `max-bookmarks` | No | Provide to fetch remote replies to any posts you have bookmarked. Determines how many of your bookmarks you want to get replies to. Recommended value: `80`. Requires an access token with `read:bookmarks` scope.
| `max-favourites` | No | Provide to fetch remote replies to any posts you have favourited. Determines how many of your favourites you want to get replies to. Recommended value: `40`. Requires an access token with `read:favourites` scope.
| `max-followings` | No | Provide to backfill profiles for your most recent followings. Determines how many of your last followings you want to backfill. Recommended value: `80`.
| `max-followers` | No | Provide to backfill profiles for your most recent followers. Determines how many of your last followers you want to backfill. Recommended value: `80`.
| `max-follow-requests` | No | Provide to backfill profiles for the API key owner's most recent pending follow requests. Determines how many of your last follow requests you want to backfill. Recommended value: `80`.
| `from-notifications` | No | Provide to backfill profiles of anyone mentioned in your recent notifications. Determines how many hours of notifications you want to look at. Requires an access token with `read:notifications` scope. Recommended value: `1`, unless you run FediFetcher less than once per hour.
| `reply-interval-in-hours` | No | Provide to fetch remote replies to posts that have received replies from users on your own instance. Determines how far back in time we'll go to find posts that have received replies. You must be an administrator on your instance to use this option, and this option is not supported on Pleroma / Akkoma and its forks. Recommended value: `0` (disabled). Requires an access token with `admin:read:accounts` scope.
|`backfill-with-context` | No | Set to `0` to disable fetching remote replies while backfilling profiles. This is enabled by default, but you can disable it, if it's too slow for you.
|`backfill-mentioned-users` | No | Set to `0` to disable backfilling any mentioned users when fetching the home timeline. This is enabled by default, but you can disable it, if it's too slow for you.
| `remember-users-for-hours` | No | How long between back-filling attempts for non-followed accounts? Defaults to `168`, i.e. one week.
| `remember-hosts-for-days` | No | How long should FediFetcher cache host info for? Defaults to `30`.
| `http-timeout` | No | The timeout for any HTTP requests to the Mastodon API in seconds. Defaults to `5`.
| `lock-hours` | No | Determines after how many hours a lock file should be discarded. Not relevant when running the script as GitHub Action, as concurrency is prevented using a different mechanism. Recommended value: `24`.
| `lock-file` | No | Location for the lock file. If not specified, will use `lock.lock` under the state directory. Not relevant when running the script as GitHub Action.
| `state-dir` | No | Directory storing persistent files, and the default location for lock file. Not relevant when running the script as GitHub Action.
| `on-start` | No | Optionally provide a callback URL that will be pinged when processing is starting. A query parameter `rid={uuid}` will automatically be appended to uniquely identify each execution. This can be used to monitor your script using a service such as healthchecks.io.
| `on-done` | No | Optionally provide a callback URL that will be called when processing is finished. A query parameter `rid={uuid}` will automatically be appended to uniquely identify each execution. This can be used to monitor your script using a service such as healthchecks.io.
| `on-fail` | No | Optionally provide a callback URL that will be called when processing has failed. A query parameter `rid={uuid}` will automatically be appended to uniquely identify each execution. This can be used to monitor your script using a service such as healthchecks.io.
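For example, a configuration that reports each run to a monitoring service could look like this sketch. The `hc-ping.com` URLs are placeholders for whatever endpoints your monitoring service provides; FediFetcher appends the `rid={uuid}` query parameter itself:

```json
{
  "access-token": "Your access token",
  "server": "your.mastodon.server",
  "home-timeline-length": 200,
  "on-start": "https://hc-ping.com/your-check-uuid/start",
  "on-done": "https://hc-ping.com/your-check-uuid",
  "on-fail": "https://hc-ping.com/your-check-uuid/fail"
}
```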
### Multi User support
If you wish to [run FediFetcher for multiple users on your instance](https://blog.thms.uk/2023/04/muli-user-support-for-fedifetcher?utm_source=github), you can supply the `access-token` as an array, with different access tokens for different users. That will allow you to fetch replies and/or backfill profiles for multiple users on your account.
This is only supported when running FediFetcher as cron job, or container. Multi-user support is not available when running FediFetcher as GitHub Action.
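As an illustration, a multi-user `config.json` could supply the tokens as an array, as in the sketch below. The token values are placeholders; each token belongs to a different user on the same instance:

```json
{
  "access-token": [
    "token-for-user-1",
    "token-for-user-2",
    "token-for-user-3"
  ],
  "server": "your.mastodon.server",
  "home-timeline-length": 200
}
```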
### Required Access Token Scopes
- For all actions, your access token must include these scopes:
- `read:search`
- `read:statuses`
- `read:accounts`
- If you are supplying `reply-interval-in-hours` you must additionally enable this scope:
- `admin:read:accounts`
- If you are supplying `max-follow-requests` you must additionally enable this scope:
- `read:follows`
- If you are supplying `max-bookmarks` you must additionally enable this scope:
- `read:bookmarks`
- If you are supplying `max-favourites` you must additionally enable this scope:
- `read:favourites`
- If you are supplying `from-notifications` you must additionally enable this scope:
- `read:notifications`
## Acknowledgments
This script is mostly taken from [Abhinav Sarkar](https://notes.abhinavsarkar.net/2023/mastodon-context), with just some additions and alterations. Thank you Abhinav!
The original inspiration of this script, as well as parts of its implementation are taken from [Abhinav Sarkar](https://notes.abhinavsarkar.net/2023/mastodon-context). Thank you Abhinav!

Multi-user cron example script (new file, 33 lines)

@@ -0,0 +1,33 @@
# This script is a sample script that you can schedule
# to run every 10 minutes from your cron job.
# Supply any other arguments, as you see fit.
# In this script, FediFetcher will fetch remote replies for multiple
# users on your instance
# TOKEN1, TOKEN2, and TOKEN3 belong to 3 different users here.
# Sample schedule:
# */10 * * * * /usr/bin/bash /path/to/FediFetcher.sh
###################### IMPORTANT ######################
# #
# YOU SHOULD RUN THIS SCRIPT MANUALLY AT LEAST ONCE #
# WITH YOUR CHOSEN ARGUMENTS, TO AVOID CONCURRENT #
# EXECUTIONS OF FEDIFETCHER! #
# #
###################### IMPORTANT ######################
cd /path/to/FediFetcher
python3 find_posts.py \
--access-token=TOKEN1 \
--access-token=TOKEN2 \
--access-token=TOKEN3 \
--server=your.server.social \
--home-timeline-length=200 \
--max-followings=80 \
--from-notifications=1 \
--lock-hours=1

examples/FediFetcher.sh (new file, 25 lines)

@@ -0,0 +1,25 @@
# This script is a sample script that you can schedule
# to run every 10 minutes from your cron job.
# Supply any other arguments, as you see fit.
# Sample schedule:
# */10 * * * * /usr/bin/bash /path/to/FediFetcher.sh
###################### IMPORTANT ######################
# #
# YOU SHOULD RUN THIS SCRIPT MANUALLY AT LEAST ONCE #
# WITH YOUR CHOSEN ARGUMENTS, TO AVOID CONCURRENT #
# EXECUTIONS OF FEDIFETCHER! #
# #
###################### IMPORTANT ######################
cd /path/to/FediFetcher
python find_posts.py \
--access-token=TOKEN \
--server=your.server.social \
--home-timeline-length=200 \
--max-followings=80 \
--from-notifications=1 \
--lock-hours=1

examples/docker-compose.yaml (new file, 19 lines)

@@ -0,0 +1,19 @@
name: fedifetcher
services:
fedifetcher:
stdin_open: true
tty: true
image: ghcr.io/nanos/fedifetcher:latest
command: "--access-token=<TOKEN> --server=<SERVER>"
# Persist our data
volumes:
- ./data:/app/artifacts
# Use the `deploy` option to enable `restart_policy`
deploy:
# Don't go above 1 replica to avoid multiple overlapping executions of the script
replicas: 1
restart_policy:
# The `any` condition means even after successful runs, we'll restart the script
condition: any
# Specify how often the script should run - for example, after 1 hour.
delay: 1h

examples/k8s-cronjob.yaml (new file, 47 lines)

@@ -0,0 +1,47 @@
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: fedifetcher-pvc
spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
resources:
requests:
storage: 100Mi
---
apiVersion: batch/v1
kind: CronJob
metadata:
name: fedifetcher
spec:
# Run every 2 hours
schedule: "0 */2 * * *"
successfulJobsHistoryLimit: 1
failedJobsHistoryLimit: 1
concurrencyPolicy: Forbid
jobTemplate:
spec:
template:
spec:
volumes:
- name: artifacts
persistentVolumeClaim:
claimName: fedifetcher-pvc
containers:
- name: fedifetcher
image: ghcr.io/nanos/fedifetcher:latest
args:
- --server=your.server.social
- --access-token=TOKEN
- --home-timeline-length
- "200"
- --max-followings
- "80"
- --from-notifications
- "4"
volumeMounts:
- name: artifacts
mountPath: /app/artifacts
restartPolicy: Never

find_posts.py (new file, 1481 lines)

File diff suppressed because it is too large.

get_context.py (501 → 28 lines)

@@ -1,501 +1,28 @@
#!/usr/bin/env python3
from datetime import datetime, timedelta
import itertools
import json
## This script is for legacy users only
## Please use find_posts.py instead
import os
import re
import sys
import requests
import time
def pull_context(
server,
access_token,
seen_urls,
replied_toot_server_ids,
reply_interval_hours,
max_home_timeline_length,
):
parsed_urls = {}
if reply_interval_hours > 0:
"""pull the context toots of toots user replied to, from their
original server, and add them to the local server."""
user_ids = get_active_user_ids(server, access_token, reply_interval_hours)
reply_toots = get_all_reply_toots(
server, user_ids, access_token, seen_urls, reply_interval_hours
)
known_context_urls = get_all_known_context_urls(server, reply_toots,parsed_urls)
seen_urls.update(known_context_urls)
replied_toot_ids = get_all_replied_toot_server_ids(
server, reply_toots, replied_toot_server_ids, parsed_urls
)
context_urls = get_all_context_urls(server, replied_toot_ids)
add_context_urls(server, access_token, context_urls, seen_urls)
if max_home_timeline_length > 0:
"""Do the same with any toots on the key owner's home timeline """
timeline_toots = get_timeline(server, access_token, max_home_timeline_length)
known_context_urls = get_all_known_context_urls(server, timeline_toots,parsed_urls)
add_context_urls(server, access_token, known_context_urls, seen_urls)
def get_timeline(server, access_token, max):
"""Get all post in the user's home timeline"""
url = f"https://{server}/api/v1/timelines/home"
try:
response = get_toots(url, access_token)
if response.status_code == 200:
toots = response.json()
elif response.status_code == 401:
raise Exception(
f"Error getting URL {url}. Status code: {response.status_code}. "
"Ensure your access token is correct"
)
elif response.status_code == 403:
raise Exception(
f"Error getting URL {url}. Status code: {response.status_code}. "
"Make sure you have the read:statuses scope enabled for your access token."
)
else:
raise Exception(
f"Error getting URL {url}. Status code: {response.status_code}"
)
# Paginate as needed
while len(toots) < max and 'next' in response.links:
response = get_toots(response.links['next']['url'], access_token)
toots = toots + response.json()
except Exception as ex:
print(f"Error getting timeline toots: {ex}")
sys.exit(1)
print(f"Found {len(toots)} toots in timeline")
return toots
def get_toots(url, access_token):
response = requests.get(
url, headers={"Authorization": f"Bearer {access_token}"}, timeout=5
)
if response.status_code == 200:
return response
elif response.status_code == 401:
raise Exception(
f"Error getting URL {url}. Status code: {response.status_code}. "
"It looks like your access token is incorrect."
)
elif response.status_code == 403:
raise Exception(
f"Error getting URL {url}. Status code: {response.status_code}. "
"Make sure you have the read:statuses scope enabled for your access token."
)
else:
raise Exception(
f"Error getting URL {url}. Status code: {response.status_code}"
)
def get_active_user_ids(server, access_token, reply_interval_hours):
"""get all user IDs on the server that have posted a toot in the given
time interval"""
since = datetime.now() - timedelta(days=reply_interval_hours / 24 + 1)
url = f"https://{server}/api/v1/admin/accounts"
resp = requests.get(
url, headers={"Authorization": f"Bearer {access_token}"}, timeout=5
)
if resp.status_code == 200:
for user in resp.json():
last_status_at = user["account"]["last_status_at"]
if last_status_at is not None:
last_active = datetime.strptime(last_status_at, "%Y-%m-%d")
if last_active > since:
print(f"Found active user: {user['username']}")
yield user["id"]
elif resp.status_code == 401:
raise Exception(
f"Error getting user IDs on server {server}. Status code: {resp.status_code}. "
"Ensure your access token is correct"
)
elif resp.status_code == 403:
raise Exception(
f"Error getting user IDs on server {server}. Status code: {resp.status_code}. "
"Make sure you have the admin:read:accounts scope enabled for your access token."
)
else:
raise Exception(
f"Error getting user IDs on server {server}. Status code: {resp.status_code}"
)
def get_all_reply_toots(
server, user_ids, access_token, seen_urls, reply_interval_hours
):
"""get all replies to other users by the given users in the last day"""
replies_since = datetime.now() - timedelta(hours=reply_interval_hours)
reply_toots = list(
itertools.chain.from_iterable(
get_reply_toots(
user_id, server, access_token, seen_urls, replies_since
)
for user_id in user_ids
)
)
print(f"Found {len(reply_toots)} reply toots")
return reply_toots
def get_reply_toots(user_id, server, access_token, seen_urls, reply_since):
"""get replies by the user to other users since the given date"""
url = f"https://{server}/api/v1/accounts/{user_id}/statuses?exclude_replies=false&limit=40"
try:
resp = requests.get(
url, headers={"Authorization": f"Bearer {access_token}"}, timeout=5
)
except Exception as ex:
print(
f"Error getting replies for user {user_id} on server {server}: {ex}"
)
return []
if resp.status_code == 200:
toots = [
toot
for toot in resp.json()
if toot["in_reply_to_id"] is not None
and toot["url"] not in seen_urls
and datetime.strptime(toot["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
> reply_since
]
for toot in toots:
print(f"Found reply toot: {toot['url']}")
return toots
elif resp.status_code == 403:
raise Exception(
f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}. "
"Make sure you have the read:statuses scope enabled for your access token."
)
raise Exception(
f"Error getting replies for user {user_id} on server {server}. Status code: {resp.status_code}"
)
def get_all_known_context_urls(server, reply_toots,parsed_urls):
"""get the context toots of the given toots from their original server"""
known_context_urls = set(
filter(
lambda url: not url.startswith(f"https://{server}/"),
itertools.chain.from_iterable(
get_toot_context(*parse_url(toot["url"] if toot["reblog"] is None else toot["reblog"]["url"],parsed_urls), toot["url"])
for toot in filter(
lambda toot: toot_has_parseable_url(toot,parsed_urls),
reply_toots
)
),
)
)
print(f"Found {len(known_context_urls)} known context toots")
return known_context_urls
def toot_has_parseable_url(toot,parsed_urls):
parsed = parse_url(toot["url"] if toot["reblog"] is None else toot["reblog"]["url"],parsed_urls)
if(parsed is None) :
return False
return True
def get_all_replied_toot_server_ids(
server, reply_toots, replied_toot_server_ids, parsed_urls
):
"""get the server and ID of the toots the given toots replied to"""
return filter(
lambda x: x is not None,
(
get_replied_toot_server_id(server, toot, replied_toot_server_ids, parsed_urls)
for toot in reply_toots
),
)
def get_replied_toot_server_id(server, toot, replied_toot_server_ids,parsed_urls):
"""get the server and ID of the toot the given toot replied to"""
in_reply_to_id = toot["in_reply_to_id"]
in_reply_to_account_id = toot["in_reply_to_account_id"]
mentions = toot["mentions"]
if len(mentions) == 0:
return None
mention = [
mention
for mention in mentions
if mention["id"] == in_reply_to_account_id
][0]
o_url = f"https://{server}/@{mention['acct']}/{in_reply_to_id}"
if o_url in replied_toot_server_ids:
return replied_toot_server_ids[o_url]
url = get_redirect_url(o_url)
if url is None:
return None
match = parse_url(url,parsed_urls)
if match is not None:
replied_toot_server_ids[o_url] = (url, match)
return (url, match)
print(f"Error parsing toot URL {url}")
replied_toot_server_ids[o_url] = None
return None
def parse_url(url, parsed_urls):
if url not in parsed_urls:
match = parse_mastodon_url(url)
if match is not None:
parsed_urls[url] = match
if url not in parsed_urls:
match = parse_pleroma_url(url)
if match is not None:
parsed_urls[url] = match
if url not in parsed_urls:
print(f"Error parsing toot URL {url}")
parsed_urls[url] = None
return parsed_urls[url]
def parse_mastodon_url(url):
"""parse a Mastodon URL and return the server and ID"""
match = re.match(
r"https://(?P<server>.*)/@(?P<username>.*)/(?P<toot_id>.*)", url
)
if match is not None:
return (match.group("server"), match.group("toot_id"))
return None
def parse_pleroma_url(url):
"""parse a Pleroma URL and return the server and ID"""
match = re.match(r"https://(?P<server>.*)/objects/(?P<toot_id>.*)", url)
if match is not None:
server = match.group("server")
url = get_redirect_url(url)
if url is None:
return None
match = re.match(r"/notice/(?P<toot_id>.*)", url)
if match is not None:
return (server, match.group("toot_id"))
return None
return None
def get_redirect_url(url):
"""get the URL given URL redirects to"""
try:
resp = requests.head(url, allow_redirects=False, timeout=5)
except Exception as ex:
print(f"Error getting redirect URL for URL {url}. Exception: {ex}")
return None
if resp.status_code == 200:
return url
elif resp.status_code == 302:
redirect_url = resp.headers["Location"]
print(f"Discovered redirect for URL {url}")
return redirect_url
else:
print(
f"Error getting redirect URL for URL {url}. Status code: {resp.status_code}"
)
return None
def get_all_context_urls(server, replied_toot_ids):
"""get the URLs of the context toots of the given toots"""
return filter(
lambda url: not url.startswith(f"https://{server}/"),
itertools.chain.from_iterable(
get_toot_context(server, toot_id, url)
for (url, (server, toot_id)) in replied_toot_ids
),
)
def get_toot_context(server, toot_id, toot_url):
"""get the URLs of the context toots of the given toot"""
url = f"https://{server}/api/v1/statuses/{toot_id}/context"
try:
resp = requests.get(url, timeout=5)
except Exception as ex:
print(f"Error getting context for toot {toot_url}. Exception: {ex}")
return []
if resp.status_code == 200:
try:
res = resp.json()
print(f"Got context for toot {toot_url}")
return (toot["url"] for toot in (res["ancestors"] + res["descendants"]))
except Exception as ex:
print(f"Error parsing context for toot {toot_url}. Exception: {ex}")
return []
elif resp.status_code == 429:
reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
print(f"Rate Limit hit when getting context for {toot_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
time.sleep((reset - datetime.now()).total_seconds() + 1)
return get_toot_context(server, toot_id, toot_url)
print(
f"Error getting context for toot {toot_url}. Status code: {resp.status_code}"
)
return []
def add_context_urls(server, access_token, context_urls, seen_urls):
"""add the given toot URLs to the server"""
count = 0
failed = 0
for url in context_urls:
if url not in seen_urls:
added = add_context_url(url, server, access_token)
if added is True:
seen_urls.add(url)
count += 1
else:
failed += 1
print(f"Added {count} new context toots (with {failed} failures)")
def add_context_url(url, server, access_token):
"""add the given toot URL to the server"""
search_url = f"https://{server}/api/v2/search?q={url}&resolve=true&limit=1"
try:
resp = requests.get(
search_url,
headers={"Authorization": f"Bearer {access_token}"},
timeout=5,
)
except Exception as ex:
print(
f"Error adding url {search_url} to server {server}. Exception: {ex}"
)
return False
if resp.status_code == 200:
print(f"Added context url {url}")
return True
elif resp.status_code == 403:
print(
f"Error adding url {search_url} to server {server}. Status code: {resp.status_code}. "
"Make sure you have the read:search scope enabled for your access token."
)
return False
elif resp.status_code == 429:
reset = datetime.strptime(resp.headers['x-ratelimit-reset'], '%Y-%m-%dT%H:%M:%S.%fZ')
print(f"Rate Limit hit when adding url {search_url}. Waiting to retry at {resp.headers['x-ratelimit-reset']}")
time.sleep((reset - datetime.now()).total_seconds() + 1)
return add_context_url(url, server, access_token)
else:
print(
f"Error adding url {search_url} to server {server}. Status code: {resp.status_code}"
)
return False
class OrderedSet:
"""An ordered set implementation over a dict"""
def __init__(self, iterable):
self._dict = {}
for item in iterable:
self.add(item)
def add(self, item):
if item not in self._dict:
self._dict[item] = None
def update(self, iterable):
for item in iterable:
self.add(item)
def __contains__(self, item):
return item in self._dict
def __iter__(self):
return iter(self._dict)
def __len__(self):
return len(self._dict)
if __name__ == "__main__":
HELP_MESSAGE = """
Usage: python3 pull_context.py <access_token> <server> <reply_interval_in_hours> <home_timeline_length>
- <access_token>: The access token can be generated at https://<server>/settings/applications,
and must have read:search, read:statuses and admin:read:accounts scopes.
- <server>: The name of your server (e.g. `mstdn.thms.uk`)
- <reply_interval_in_hours>: Only look at posts that have received replies in this period
- <home_timeline_length>: Also look for replies to posts in the API-Key owner's home timeline, up to
this many posts
"""
if len(sys.argv) < 5:
print(HELP_MESSAGE)
sys.exit(1)
ACCESS_TOKEN = sys.argv[1]
SERVER = sys.argv[2]
REPLY_INTERVAL_IN_HOURS = int(sys.argv[3])
MAX_HOME_TIMELINE_LENGTH = int(sys.argv[4])
print(
f"Getting last {REPLY_INTERVAL_IN_HOURS} hrs of replies, and latest {MAX_HOME_TIMELINE_LENGTH} posts in home timeline from {SERVER}"
)
SEEN_URLS_FILE = "artifacts/seen_urls"
REPLIED_TOOT_SERVER_IDS_FILE = "artifacts/replied_toot_server_ids"
SEEN_URLS = OrderedSet([])
if os.path.exists(SEEN_URLS_FILE):
with open(SEEN_URLS_FILE, "r", encoding="utf-8") as f:
SEEN_URLS = OrderedSet(f.read().splitlines())
REPLIED_TOOT_SERVER_IDS = {}
if os.path.exists(REPLIED_TOOT_SERVER_IDS_FILE):
with open(REPLIED_TOOT_SERVER_IDS_FILE, "r", encoding="utf-8") as f:
REPLIED_TOOT_SERVER_IDS = json.load(f)
pull_context(
SERVER,
ACCESS_TOKEN,
SEEN_URLS,
REPLIED_TOOT_SERVER_IDS,
REPLY_INTERVAL_IN_HOURS,
MAX_HOME_TIMELINE_LENGTH,
)
with open(SEEN_URLS_FILE, "w", encoding="utf-8") as f:
f.write("\n".join(list(SEEN_URLS)[:10000]))
with open(REPLIED_TOOT_SERVER_IDS_FILE, "w", encoding="utf-8") as f:
json.dump(dict(list(REPLIED_TOOT_SERVER_IDS.items())[:10000]), f)
ACCESS_TOKEN = sys.argv[1]
SERVER = sys.argv[2]
REPLY_INTERVAL_IN_HOURS = int(sys.argv[3])
MAX_HOME_TIMELINE_LENGTH = int(sys.argv[4])
if len(sys.argv) > 5:
MAX_FOLLOWINGS = int(sys.argv[5])
else:
MAX_FOLLOWINGS = 0
if len(sys.argv) > 6:
BACKFILL_FOLLOWINGS_FOR_USER = sys.argv[6]
else:
BACKFILL_FOLLOWINGS_FOR_USER = ''
if len(sys.argv) > 7:
MAX_FOLLOWERS = int(sys.argv[7])
else:
MAX_FOLLOWERS = 0
os.system(f"python find_posts.py --server={SERVER} --access-token={ACCESS_TOKEN} --reply-interval-in-hours={REPLY_INTERVAL_IN_HOURS} --home-timeline-length={MAX_HOME_TIMELINE_LENGTH} --user={BACKFILL_FOLLOWINGS_FOR_USER} --max-followings={MAX_FOLLOWINGS} --max-followers={MAX_FOLLOWERS}")

requirements.txt

@@ -2,6 +2,9 @@ certifi==2022.12.7
 charset-normalizer==3.0.1
 docutils==0.19
 idna==3.4
+python-dateutil==2.8.2
 requests==2.28.2
+six==1.16.0
 smmap==5.0.0
 urllib3==1.26.14
+defusedxml==0.7.1