add recursive search support
This commit is contained in:
parent
4f394e3925
commit
33b5ac9390
2 changed files with 250 additions and 12 deletions
138
scraper-nodepth.go
Normal file
138
scraper-nodepth.go
Normal file
|
@ -0,0 +1,138 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
"github.com/go-rod/rod"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Shared crawler state, accessed by every goroutine the crawl spawns.
var (
	// visitedURLs records each URL whose processing has been started.
	visitedURLs sync.Map
	// allLinks records each distinct pad link discovered so far.
	allLinks sync.Map
	// mutex serializes console output across goroutines.
	mutex = new(sync.Mutex)
	// wg tracks the outstanding extractLinksFromPage goroutines.
	wg sync.WaitGroup
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
fmt.Println("Starte das Programm...")
|
||||||
|
startTime := time.Now()
|
||||||
|
|
||||||
|
browser := rod.New().MustConnect()
|
||||||
|
defer browser.MustClose()
|
||||||
|
|
||||||
|
initialURL := "https://pad.stratum0.org/p/dc"
|
||||||
|
fmt.Printf("Beginne mit der initialen URL: %s\n", initialURL)
|
||||||
|
|
||||||
|
wg.Add(1)
|
||||||
|
go extractLinksFromPage(browser, initialURL)
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
fmt.Println("\nAlle gefundenen Links:")
|
||||||
|
linkCount := 0
|
||||||
|
allLinks.Range(func(key, value interface{}) bool {
|
||||||
|
fmt.Println(key)
|
||||||
|
linkCount++
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
|
||||||
|
fmt.Printf("\nStatistik:\n")
|
||||||
|
fmt.Printf("Gesamtanzahl der gefundenen Links: %d\n", linkCount)
|
||||||
|
visitedCount := 0
|
||||||
|
visitedURLs.Range(func(key, value interface{}) bool {
|
||||||
|
visitedCount++
|
||||||
|
return true
|
||||||
|
})
|
||||||
|
fmt.Printf("Anzahl der besuchten URLs: %d\n", visitedCount)
|
||||||
|
fmt.Printf("Gesamtzeit: %v\n", time.Since(startTime))
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractLinksFromPage(browser *rod.Browser, url string) {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("\nVerarbeite URL: %s\n", url)
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
if _, visited := visitedURLs.LoadOrStore(url, true); visited {
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("URL bereits besucht: %s\n", url)
|
||||||
|
mutex.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
page := browser.MustPage(url)
|
||||||
|
defer page.MustClose()
|
||||||
|
page.MustWaitLoad()
|
||||||
|
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Seite geladen: %s\n", url)
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
// Verarbeite die Hauptseite
|
||||||
|
processPage(page, url)
|
||||||
|
|
||||||
|
// Verarbeite die verschachtelten iFrames
|
||||||
|
processNestedIframes(page, url)
|
||||||
|
}
|
||||||
|
|
||||||
|
func processNestedIframes(page *rod.Page, sourceURL string) {
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Suche nach äußerem iFrame auf %s\n", sourceURL)
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
// Finden Sie das erste iFrame-Element
|
||||||
|
outerIframeElement := page.MustElement("#editorcontainer > iframe:nth-child(1)")
|
||||||
|
|
||||||
|
// Wechseln Sie zum Kontext des ersten iFrames
|
||||||
|
outerFrame := outerIframeElement.MustFrame()
|
||||||
|
|
||||||
|
// Warten Sie, bis der Inhalt des ersten iFrames geladen ist
|
||||||
|
outerFrame.MustWaitLoad()
|
||||||
|
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Äußeres iFrame geladen auf %s\n", sourceURL)
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
// Verarbeite das äußere iFrame
|
||||||
|
processPage(outerFrame, sourceURL+" (äußeres iFrame)")
|
||||||
|
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Suche nach innerem iFrame auf %s\n", sourceURL)
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
// Finden Sie das zweite iFrame-Element innerhalb des ersten iFrames
|
||||||
|
innerIframeElement := outerFrame.MustElement("#outerdocbody > iframe:nth-child(1)")
|
||||||
|
|
||||||
|
// Wechseln Sie zum Kontext des zweiten iFrames
|
||||||
|
innerFrame := innerIframeElement.MustFrame()
|
||||||
|
innerFrame.MustWaitLoad()
|
||||||
|
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Inneres iFrame geladen auf %s\n", sourceURL)
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
// Verarbeite das innere iFrame
|
||||||
|
processPage(innerFrame, sourceURL+" (inneres iFrame)")
|
||||||
|
}
|
||||||
|
|
||||||
|
func processPage(page *rod.Page, sourceURL string) {
|
||||||
|
text := page.MustElement("body").MustText()
|
||||||
|
|
||||||
|
re := regexp.MustCompile(`https://pad\.stratum0\.org/p/dc[^\s"']+`)
|
||||||
|
links := re.FindAllString(text, -1)
|
||||||
|
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Gefundene Links auf %s: %d\n", sourceURL, len(links))
|
||||||
|
mutex.Unlock()
|
||||||
|
|
||||||
|
for _, link := range links {
|
||||||
|
if _, exists := allLinks.LoadOrStore(link, true); !exists {
|
||||||
|
mutex.Lock()
|
||||||
|
fmt.Printf("Neuer Link gefunden: %s\n", link)
|
||||||
|
mutex.Unlock()
|
||||||
|
wg.Add(1)
|
||||||
|
go extractLinksFromPage(page.Browser(), link)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
124
scraper.go
124
scraper.go
|
@ -3,18 +3,96 @@ package main
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"time"
|
||||||
"github.com/go-rod/rod"
|
"github.com/go-rod/rod"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Crawler state and configuration.
var (
	// visitedURLs marks every URL that has already been loaded.
	visitedURLs = map[string]bool{}
	// allLinks marks every distinct pad link discovered so far.
	allLinks = map[string]bool{}
	// maxDepth is the maximum crawl depth; adjust it here.
	maxDepth = 3
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
fmt.Println("Starte das Programm...")
|
||||||
|
startTime := time.Now()
|
||||||
|
|
||||||
browser := rod.New().MustConnect()
|
browser := rod.New().MustConnect()
|
||||||
defer browser.MustClose()
|
defer browser.MustClose()
|
||||||
|
|
||||||
page := browser.MustPage("https://pad.stratum0.org/p/dc") // URL der Hauptseite
|
initialURL := "https://pad.stratum0.org/p/dc"
|
||||||
|
fmt.Printf("Beginne mit der initialen URL: %s\n", initialURL)
|
||||||
|
fmt.Printf("Maximale Tiefe: %d\n", maxDepth)
|
||||||
|
|
||||||
|
toVisit := []struct {
|
||||||
|
url string
|
||||||
|
depth int
|
||||||
|
}{{initialURL, 0}}
|
||||||
|
|
||||||
|
for len(toVisit) > 0 {
|
||||||
|
current := toVisit[0]
|
||||||
|
toVisit = toVisit[1:]
|
||||||
|
|
||||||
|
newLinks := extractLinksFromPage(browser, current.url, current.depth)
|
||||||
|
|
||||||
|
for _, link := range newLinks {
|
||||||
|
if !visitedURLs[link] && current.depth < maxDepth {
|
||||||
|
toVisit = append(toVisit, struct {
|
||||||
|
url string
|
||||||
|
depth int
|
||||||
|
}{link, current.depth + 1})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Println("\nAlle gefundenen Links:")
|
||||||
|
for link := range allLinks {
|
||||||
|
fmt.Println(link)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("\nStatistik:\n")
|
||||||
|
fmt.Printf("Gesamtanzahl der gefundenen Links: %d\n", len(allLinks))
|
||||||
|
fmt.Printf("Anzahl der besuchten URLs: %d\n", len(visitedURLs))
|
||||||
|
fmt.Printf("Gesamtzeit: %v\n", time.Since(startTime))
|
||||||
|
}
|
||||||
|
|
||||||
|
func extractLinksFromPage(browser *rod.Browser, url string, depth int) []string {
|
||||||
|
fmt.Printf("\nVerarbeite URL: %s (Tiefe: %d)\n", url, depth)
|
||||||
|
|
||||||
|
if depth > maxDepth {
|
||||||
|
fmt.Printf("Maximale Tiefe erreicht für URL: %s\n", url)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if visitedURLs[url] {
|
||||||
|
fmt.Printf("URL bereits besucht: %s\n", url)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
visitedURLs[url] = true
|
||||||
|
|
||||||
|
page := browser.MustPage(url)
|
||||||
|
defer page.MustClose()
|
||||||
page.MustWaitLoad()
|
page.MustWaitLoad()
|
||||||
|
|
||||||
|
fmt.Printf("Seite geladen: %s\n", url)
|
||||||
|
|
||||||
|
var newLinks []string
|
||||||
|
|
||||||
|
// Verarbeite die Hauptseite
|
||||||
|
newLinks = append(newLinks, extractLinks(page, url)...)
|
||||||
|
|
||||||
|
// Verarbeite die verschachtelten iFrames
|
||||||
|
newLinks = append(newLinks, processNestedIframes(page, url)...)
|
||||||
|
|
||||||
|
return newLinks
|
||||||
|
}
|
||||||
|
|
||||||
|
func processNestedIframes(page *rod.Page, sourceURL string) []string {
|
||||||
|
fmt.Printf("Suche nach äußerem iFrame auf %s\n", sourceURL)
|
||||||
|
|
||||||
// Finden Sie das erste iFrame-Element
|
// Finden Sie das erste iFrame-Element
|
||||||
outerIframeElement := page.MustElement("#editorcontainer > iframe:nth-child(1)") // Passen Sie den Selektor an
|
outerIframeElement := page.MustElement("#editorcontainer > iframe:nth-child(1)")
|
||||||
|
|
||||||
// Wechseln Sie zum Kontext des ersten iFrames
|
// Wechseln Sie zum Kontext des ersten iFrames
|
||||||
outerFrame := outerIframeElement.MustFrame()
|
outerFrame := outerIframeElement.MustFrame()
|
||||||
|
@ -22,22 +100,44 @@ func main() {
|
||||||
// Warten Sie, bis der Inhalt des ersten iFrames geladen ist
|
// Warten Sie, bis der Inhalt des ersten iFrames geladen ist
|
||||||
outerFrame.MustWaitLoad()
|
outerFrame.MustWaitLoad()
|
||||||
|
|
||||||
|
fmt.Printf("Äußeres iFrame geladen auf %s\n", sourceURL)
|
||||||
|
|
||||||
|
// Extrahiere Links aus dem äußeren iFrame
|
||||||
|
outerLinks := extractLinks(outerFrame, sourceURL+" (äußeres iFrame)")
|
||||||
|
|
||||||
|
fmt.Printf("Suche nach innerem iFrame auf %s\n", sourceURL)
|
||||||
|
|
||||||
// Finden Sie das zweite iFrame-Element innerhalb des ersten iFrames
|
// Finden Sie das zweite iFrame-Element innerhalb des ersten iFrames
|
||||||
innerIframeElement := outerFrame.MustElement("#outerdocbody > iframe:nth-child(1)") // Passen Sie den Selektor an
|
innerIframeElement := outerFrame.MustElement("#outerdocbody > iframe:nth-child(1)")
|
||||||
|
|
||||||
// Wechseln Sie zum Kontext des zweiten iFrames
|
// Wechseln Sie zum Kontext des zweiten iFrames
|
||||||
innerFrame := innerIframeElement.MustFrame()
|
innerFrame := innerIframeElement.MustFrame()
|
||||||
innerFrame.MustWaitLoad()
|
innerFrame.MustWaitLoad()
|
||||||
|
|
||||||
// Extrahieren Sie den Text aus dem zweiten iFrame (Benutze statt #innderdocbody auch body für jedes Element)
|
fmt.Printf("Inneres iFrame geladen auf %s\n", sourceURL)
|
||||||
text := innerFrame.MustElement("#innerdocbody").MustText()
|
|
||||||
|
|
||||||
// Regex zum Finden von Links, die mit https beginnen
|
// Extrahiere Links aus dem inneren iFrame
|
||||||
re := regexp.MustCompile(`https://[\w\.-]+(?:/[\w\.-]*)*`)
|
innerLinks := extractLinks(innerFrame, sourceURL+" (inneres iFrame)")
|
||||||
httpsLinks := re.FindAllString(text, -1)
|
|
||||||
|
|
||||||
fmt.Println("Gefundene https-Links:")
|
return append(outerLinks, innerLinks...)
|
||||||
for _, link := range httpsLinks {
|
}
|
||||||
fmt.Println(link)
|
|
||||||
}
|
func extractLinks(page *rod.Page, sourceURL string) []string {
|
||||||
|
text := page.MustElement("body").MustText()
|
||||||
|
|
||||||
|
re := regexp.MustCompile(`https://pad\.stratum0\.org/p/dc[^\s"']+`)
|
||||||
|
links := re.FindAllString(text, -1)
|
||||||
|
|
||||||
|
fmt.Printf("Gefundene Links auf %s: %d\n", sourceURL, len(links))
|
||||||
|
|
||||||
|
var newLinks []string
|
||||||
|
for _, link := range links {
|
||||||
|
if !allLinks[link] {
|
||||||
|
allLinks[link] = true
|
||||||
|
fmt.Printf("Neuer Link gefunden: %s\n", link)
|
||||||
|
newLinks = append(newLinks, link)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return newLinks
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue