add download script for pad download

2024-09-21 20:28:37 +02:00 · 2024-09-21 20:28:37 +02:00 · 51e9239b9d
commit 51e9239b9d
parent bfa96ac46d
2 changed files with 45 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -87,3 +87,8 @@ git reset --hard 8& git clean -dfx
 pacman -S qemu-user-static qemu-user-static-binfmt
 https://wiki.archlinux.org/title/QEMU#Chrooting_into_arm/arm64_environment_from_x86_64
 ~~~
 ### Side crawling on an etherpad-lite pad
 ~~~
 cat urls | xargs -I+ wget --recursive '+/export/markdown' -erobots=off ; grep -r -o -E --no-filename 'pad.stratum0.org/p/dc[a-zA-Z0-9-]*' pad.stratum0.org >> urls; sort -u -o urls urls
 ~~~
--- a/crawl_url.sh
+++ b/crawl_url.sh
@ -0,0 +1,39 @@
 #!/bin/bash
 # Crawler configuration.
 # Usage: crawl_url.sh [depth]

 # Root document of the crawl: the plain-text export of the "dc" pad.
 base_url="https://pad.stratum0.org/p/dc/export/txt"

 # Maximum crawl depth, taken from the first argument (default: 3).
 depth=${1:-3}
 # Only a non-negative integer is accepted: any other value would make the
 # depth comparison in crawl_url error out or misbehave, so the limit would
 # no longer bound the crawl reliably.
 if ! [[ "$depth" =~ ^[0-9]+$ ]]; then
     printf 'Usage: %s [depth]  (depth must be a non-negative integer, got "%s")\n' \
         "${0##*/}" "$depth" >&2
     exit 2
 fi
 # Force base 10 so that a value such as "08" is not parsed as invalid octal.
 depth=$((10#$depth))

 # User-Agent header sent with every request.
 user_agent="Mozilla/5.0 (compatible; EtherpadCrawler/1.0)"
 # Delay between spawning requests, in seconds.
 delay=0
 #######################################
 # Recursively crawl a pad export and every dc-* pad it links to.
 # Globals:   depth (read), user_agent (read), delay (read)
 # Arguments: $1 - URL to fetch
 #            $2 - depth of this URL (the root is 1)
 # Outputs:   progress lines on stdout, fetch errors on stderr
 # Returns:   0 on success or when the depth limit is exceeded,
 #            1 if the URL could not be fetched
 #######################################
 crawl_url() {
     local url=$1
     local current_depth=$2
     local response links link
     local pids=()

     # Stop descending once the configured depth limit is exceeded.
     if [[ "$current_depth" -gt "$depth" ]]; then
         return 0
     fi
     echo "Durchsuche: $url"

     # -f makes curl fail on HTTP errors (4xx/5xx) instead of handing back an
     # error page as if it were pad content; -m 30 caps the request time.
     if ! response=$(curl -sf -A "$user_agent" -m 30 "$url"); then
         echo "Fehler beim Abrufen von $url" >&2
         return 1
     fi

     # Extract linked dc-* pads, reduce each to its bare pad URL (dropping any
     # "/export/..." suffix) and de-duplicate while keeping document order.
     links=$(printf '%s\n' "$response" \
         | grep -oE 'https?://pad\.stratum0\.org/p/dc-[^"[:space:]]+' \
         | sed 's#/export/.*$##' \
         | awk '!seen[$0]++')

     while IFS= read -r link; do
         [[ -n "$link" ]] || continue
         echo "Gefunden: $link"
         # Delay between requests.
         sleep "$delay"
         # Fetch the text export, like the root URL: the bare pad URL only
         # serves the editor page, which does not contain the pad text.
         crawl_url "${link}/export/txt" $((current_depth + 1)) &
         pids+=("$!")
     done <<<"$links"

     # Wait for the children spawned at this level. Without this, a
     # backgrounded crawl returns right after spawning its own children and
     # the caller's wait never covers them.
     if ((${#pids[@]} > 0)); then
         wait "${pids[@]}"
     fi
     return 0
 }
 # Kick off the crawl at the root URL; the root counts as depth 1.
 crawl_url "${base_url}" 1
 # Block until the background jobs started by this shell have finished.
 wait