make better progress bar and add debug mode with -debug

This commit is contained in:
parent 66691780ea
commit d09f46848c

1 changed file with 34 additions and 22 deletions:
 scraper.go | 56 ++++++++++++++++++++++++++++++++++----------------------

--- a/scraper.go
+++ b/scraper.go
@@ -1,9 +1,14 @@
 package main
 
 import (
+    "flag"
     "fmt"
+    "log"
+    "os"
     "regexp"
     "time"
+    "io"
+
     "github.com/go-rod/rod"
     "github.com/schollz/progressbar/v3"
 )
@@ -11,23 +16,34 @@ import (
 var (
     visitedURLs = make(map[string]bool)
     allLinks    = make(map[string]bool)
-    maxDepth    = 3 // Hier können Sie die maximale Tiefe festlegen
-
-    // Neue Variablen für initiale URLs und Regex-Pattern
-    initialURLs      = []string{"https://pad.stratum0.org/p/dc"}
+    maxDepth    = 3
+    initialURLs = []string{"https://pad.stratum0.org/p/dc"}
     linkRegexPattern = `https://pad\.stratum0\.org/p/dc[^\s"']+`
+    debugMode bool
+    logger    *log.Logger
 )
 
+func init() {
+    flag.BoolVar(&debugMode, "debug", false, "Enable debug mode")
+    flag.Parse()
+
+    if debugMode {
+        logger = log.New(os.Stdout, "DEBUG: ", log.Ldate|log.Ltime|log.Lshortfile)
+    } else {
+        logger = log.New(io.Discard, "", 0)
+    }
+}
+
 func main() {
-    fmt.Println("Starte das Programm...")
     startTime := time.Now()
 
+    logger.Println("Starte das Programm...")
     browser := rod.New().MustConnect()
     defer browser.MustClose()
 
-    fmt.Printf("Initiale URLs: %v\n", initialURLs)
-    fmt.Printf("Link Regex Pattern: %s\n", linkRegexPattern)
-    fmt.Printf("Maximale Tiefe: %d\n", maxDepth)
+    logger.Printf("Initiale URLs: %v\n", initialURLs)
+    logger.Printf("Link Regex Pattern: %s\n", linkRegexPattern)
+    logger.Printf("Maximale Tiefe: %d\n", maxDepth)
 
     toVisit := make([]struct {
         url string
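Note on the new debug mode: init() parses the -debug flag and points the package-level logger at os.Stdout when it is set, and at io.Discard otherwise, so all the former fmt output only appears in debug runs (e.g. go run scraper.go -debug). A minimal, self-contained sketch of that writer-swapping pattern, with names chosen here for illustration rather than taken from this repository:

package main

import (
    "flag"
    "io"
    "log"
    "os"
)

var debug = flag.Bool("debug", false, "enable debug output")

func main() {
    flag.Parse()

    // Write debug output to stdout only when -debug is given,
    // otherwise throw it away so normal runs stay quiet.
    var out io.Writer = io.Discard
    if *debug {
        out = os.Stdout
    }
    logger := log.New(out, "DEBUG: ", log.Ldate|log.Ltime|log.Lshortfile)

    logger.Println("this line is only visible with -debug")
}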
@@ -76,15 +92,15 @@ func main() {
 }
 
 func extractLinksFromPage(browser *rod.Browser, url string, depth int) []string {
-    fmt.Printf("\nVerarbeite URL: %s (Tiefe: %d)\n", url, depth)
+    logger.Printf("\nVerarbeite URL: %s (Tiefe: %d)\n", url, depth)
 
     if depth > maxDepth {
-        fmt.Printf("Maximale Tiefe erreicht für URL: %s\n", url)
+        logger.Printf("Maximale Tiefe erreicht für URL: %s\n", url)
         return nil
     }
 
     if visitedURLs[url] {
-        fmt.Printf("URL bereits besucht: %s\n", url)
+        logger.Printf("URL bereits besucht: %s\n", url)
         return nil
     }
 
@@ -94,41 +110,37 @@ func extractLinksFromPage(browser *rod.Browser, url string, depth int) []string
     defer page.MustClose()
     page.MustWaitLoad()
 
-    fmt.Printf("Seite geladen: %s\n", url)
+    logger.Printf("Seite geladen: %s\n", url)
 
-    bar := progressbar.Default(100)
-
     var newLinks []string
 
     mainLinks := extractLinks(page, url)
     newLinks = append(newLinks, mainLinks...)
-    bar.Add(50)
 
     iframeLinks := processNestedIframes(page, url)
     newLinks = append(newLinks, iframeLinks...)
-    bar.Add(50)
 
     return newLinks
 }
 
 func processNestedIframes(page *rod.Page, sourceURL string) []string {
-    fmt.Printf("Suche nach äußerem iFrame auf %s\n", sourceURL)
+    logger.Printf("Suche nach äußerem iFrame auf %s\n", sourceURL)
 
     outerIframeElement := page.MustElement("#editorcontainer > iframe:nth-child(1)")
     outerFrame := outerIframeElement.MustFrame()
     outerFrame.MustWaitLoad()
 
-    fmt.Printf("Äußeres iFrame geladen auf %s\n", sourceURL)
+    logger.Printf("Äußeres iFrame geladen auf %s\n", sourceURL)
 
     outerLinks := extractLinks(outerFrame, sourceURL+" (äußeres iFrame)")
 
-    fmt.Printf("Suche nach innerem iFrame auf %s\n", sourceURL)
+    logger.Printf("Suche nach innerem iFrame auf %s\n", sourceURL)
 
     innerIframeElement := outerFrame.MustElement("#outerdocbody > iframe:nth-child(1)")
     innerFrame := innerIframeElement.MustFrame()
     innerFrame.MustWaitLoad()
 
-    fmt.Printf("Inneres iFrame geladen auf %s\n", sourceURL)
+    logger.Printf("Inneres iFrame geladen auf %s\n", sourceURL)
 
     innerLinks := extractLinks(innerFrame, sourceURL+" (inneres iFrame)")
 
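Note on the progress bar: the fixed-size per-page bar (progressbar.Default(100) with two bar.Add(50) steps) is removed from extractLinksFromPage; going by the commit title, progress is presumably reported from the crawl loop in main instead, which lies outside the hunks shown here. As a rough sketch only, this is what a queue-driven bar with schollz/progressbar/v3 can look like; fakeExtract, queue and seen are illustrative stand-ins, not code from this commit:

package main

import (
    "time"

    "github.com/schollz/progressbar/v3"
)

// fakeExtract stands in for extractLinksFromPage: it pretends every page
// yields two new links until roughly 20 URLs are known, just to drive the bar.
func fakeExtract(url string, known int) []string {
    if known > 20 {
        return nil
    }
    return []string{url + "/a", url + "/b"}
}

func main() {
    queue := []string{"https://pad.stratum0.org/p/dc"}
    seen := map[string]bool{queue[0]: true}

    bar := progressbar.Default(int64(len(queue)), "crawling")
    for len(queue) > 0 {
        url := queue[0]
        queue = queue[1:]

        for _, link := range fakeExtract(url, len(seen)) {
            if !seen[link] {
                seen[link] = true
                queue = append(queue, link)
            }
        }

        // Let the bar grow with the frontier, then mark this URL as done.
        bar.ChangeMax(len(seen))
        bar.Add(1)
        time.Sleep(50 * time.Millisecond)
    }
}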
@@ -141,13 +153,13 @@ func extractLinks(page *rod.Page, sourceURL string) []string {
     re := regexp.MustCompile(linkRegexPattern)
     links := re.FindAllString(text, -1)
 
-    fmt.Printf("Gefundene Links auf %s: %d\n", sourceURL, len(links))
+    logger.Printf("Gefundene Links auf %s: %d\n", sourceURL, len(links))
 
     var newLinks []string
     for _, link := range links {
         if !allLinks[link] {
             allLinks[link] = true
-            fmt.Printf("Neuer Link gefunden: %s\n", link)
+            logger.Printf("Neuer Link gefunden: %s\n", link)
             newLinks = append(newLinks, link)
         }
     }
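Note on extractLinks: it simply runs linkRegexPattern over the page text and deduplicates matches through the shared allLinks map. A small standalone illustration of what that pattern matches (the sample text is made up):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Same pattern as linkRegexPattern in scraper.go.
    re := regexp.MustCompile(`https://pad\.stratum0\.org/p/dc[^\s"']+`)

    sample := `see https://pad.stratum0.org/p/dc38-orga and
"https://pad.stratum0.org/p/dc38-kitchen", but not https://example.org/p/dc`

    // Prints the two pad links; whitespace and the closing quote end each
    // match, and the non-pad URL is ignored.
    for _, link := range re.FindAllString(sample, -1) {
        fmt.Println(link)
    }
}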