// Source: universal-link-extractor-et.../scraper.go (169 lines, 4.3 KiB, Go)

package main
import (
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
	"sort"
	"time"

	"github.com/go-rod/rod"
	"github.com/schollz/progressbar/v3"
)
// Crawl configuration and shared crawl state.
var (
	// initialURLs seeds the crawl queue; every entry starts at depth 0.
	initialURLs = []string{"https://pad.stratum0.org/p/dc"}

	// linkRegexPattern is the regular expression applied to page text to
	// find links. It matches the seed pad URL prefix followed by at least
	// one character that is not whitespace or a quote.
	linkRegexPattern = `https://pad\.stratum0\.org/p/dc[^\s"']+`

	// maxDepth is the deepest level whose links are still followed; links
	// found on a page at this depth are recorded but not queued.
	maxDepth = 3

	// visitedURLs is the set of page URLs that have already been opened.
	visitedURLs = map[string]bool{}

	// allLinks is the set of every distinct link matched so far.
	allLinks = map[string]bool{}

	// debugMode is bound to the -debug command-line flag.
	debugMode bool

	// logger receives diagnostic output; it is assigned during start-up and
	// discards everything unless debug mode is enabled.
	logger *log.Logger
)
func init() {
flag.BoolVar(&debugMode, "debug", false, "Enable debug mode")
flag.Parse()
if debugMode {
logger = log.New(os.Stdout, "DEBUG: ", log.Ldate|log.Ltime|log.Lshortfile)
} else {
logger = log.New(io.Discard, "", 0)
}
}
2024-09-21 21:03:48 +02:00
func main() {
2024-09-21 21:36:06 +02:00
startTime := time.Now()
logger.Println("Starte das Programm...")
2024-09-21 21:03:48 +02:00
browser := rod.New().MustConnect()
defer browser.MustClose()
logger.Printf("Initiale URLs: %v\n", initialURLs)
logger.Printf("Link Regex Pattern: %s\n", linkRegexPattern)
logger.Printf("Maximale Tiefe: %d\n", maxDepth)
2024-09-21 21:36:06 +02:00
2024-09-21 23:32:04 +02:00
toVisit := make([]struct {
2024-09-21 21:36:06 +02:00
url string
depth int
2024-09-21 23:32:04 +02:00
}, len(initialURLs))
for i, url := range initialURLs {
toVisit[i] = struct {
url string
depth int
}{url, 0}
}
totalURLs := len(toVisit)
bar := progressbar.Default(int64(totalURLs))
2024-09-21 21:36:06 +02:00
for len(toVisit) > 0 {
current := toVisit[0]
toVisit = toVisit[1:]
newLinks := extractLinksFromPage(browser, current.url, current.depth)
for _, link := range newLinks {
if !visitedURLs[link] && current.depth < maxDepth {
toVisit = append(toVisit, struct {
url string
depth int
}{link, current.depth + 1})
2024-09-21 23:32:04 +02:00
totalURLs++
bar.ChangeMax(totalURLs)
2024-09-21 21:36:06 +02:00
}
}
2024-09-21 23:32:04 +02:00
bar.Add(1)
2024-09-21 21:36:06 +02:00
}
fmt.Println("\nAlle gefundenen Links:")
for link := range allLinks {
fmt.Println(link)
}
fmt.Printf("\nStatistik:\n")
fmt.Printf("Gesamtanzahl der gefundenen Links: %d\n", len(allLinks))
fmt.Printf("Anzahl der besuchten URLs: %d\n", len(visitedURLs))
fmt.Printf("Gesamtzeit: %v\n", time.Since(startTime))
}
func extractLinksFromPage(browser *rod.Browser, url string, depth int) []string {
logger.Printf("\nVerarbeite URL: %s (Tiefe: %d)\n", url, depth)
2024-09-21 21:36:06 +02:00
if depth > maxDepth {
logger.Printf("Maximale Tiefe erreicht für URL: %s\n", url)
2024-09-21 21:36:06 +02:00
return nil
}
if visitedURLs[url] {
logger.Printf("URL bereits besucht: %s\n", url)
2024-09-21 21:36:06 +02:00
return nil
}
visitedURLs[url] = true
page := browser.MustPage(url)
defer page.MustClose()
2024-09-21 21:03:48 +02:00
page.MustWaitLoad()
logger.Printf("Seite geladen: %s\n", url)
2024-09-21 23:32:04 +02:00
2024-09-21 21:36:06 +02:00
var newLinks []string
2024-09-21 23:32:04 +02:00
mainLinks := extractLinks(page, url)
newLinks = append(newLinks, mainLinks...)
2024-09-21 21:36:06 +02:00
2024-09-21 23:32:04 +02:00
iframeLinks := processNestedIframes(page, url)
newLinks = append(newLinks, iframeLinks...)
2024-09-21 21:36:06 +02:00
return newLinks
}
func processNestedIframes(page *rod.Page, sourceURL string) []string {
logger.Printf("Suche nach äußerem iFrame auf %s\n", sourceURL)
2024-09-21 21:36:06 +02:00
outerIframeElement := page.MustElement("#editorcontainer > iframe:nth-child(1)")
2024-09-21 21:03:48 +02:00
outerFrame := outerIframeElement.MustFrame()
outerFrame.MustWaitLoad()
logger.Printf("Äußeres iFrame geladen auf %s\n", sourceURL)
2024-09-21 21:36:06 +02:00
outerLinks := extractLinks(outerFrame, sourceURL+" (äußeres iFrame)")
logger.Printf("Suche nach innerem iFrame auf %s\n", sourceURL)
2024-09-21 21:36:06 +02:00
innerIframeElement := outerFrame.MustElement("#outerdocbody > iframe:nth-child(1)")
2024-09-21 21:03:48 +02:00
innerFrame := innerIframeElement.MustFrame()
innerFrame.MustWaitLoad()
logger.Printf("Inneres iFrame geladen auf %s\n", sourceURL)
2024-09-21 21:03:48 +02:00
2024-09-21 21:36:06 +02:00
innerLinks := extractLinks(innerFrame, sourceURL+" (inneres iFrame)")
2024-09-21 21:03:48 +02:00
2024-09-21 21:36:06 +02:00
return append(outerLinks, innerLinks...)
}
func extractLinks(page *rod.Page, sourceURL string) []string {
text := page.MustElement("body").MustText()
2024-09-21 23:32:04 +02:00
re := regexp.MustCompile(linkRegexPattern)
2024-09-21 21:36:06 +02:00
links := re.FindAllString(text, -1)
logger.Printf("Gefundene Links auf %s: %d\n", sourceURL, len(links))
2024-09-21 21:36:06 +02:00
var newLinks []string
for _, link := range links {
if !allLinks[link] {
allLinks[link] = true
logger.Printf("Neuer Link gefunden: %s\n", link)
2024-09-21 21:36:06 +02:00
newLinks = append(newLinks, link)
}
2024-09-21 21:03:48 +02:00
}
2024-09-21 21:36:06 +02:00
return newLinks
2024-09-21 21:03:48 +02:00
}