Let's crawl some shit

akp 2023-11-04 20:10:51 +00:00
parent dc8d1a8834
commit c71748fc48
No known key found for this signature in database
GPG key ID: CF8D58F3DEB20755
2 changed files with 29 additions and 12 deletions

View file

@@ -22,7 +22,7 @@ import (
 	"time"
 )
 
-func (c *CrawlCore) Loop(stop chan struct{}) error {
+func (c *CrawlCore) Loop(stop chan os.Signal) error {
 	jobs := make(chan *database.Site)
 	defer func() {
@@ -45,12 +45,10 @@ func (c *CrawlCore) Loop(stop chan struct{}) error {
 mainLoop:
 	for {
-		if stop != nil {
-			select {
-			case <-stop:
-				break mainLoop
-			default:
-			}
-		}
+		select {
+		case <-stop:
+			break mainLoop
+		default:
+		}
 
 		tx, err := c.DB.BeginTx(context.Background(), nil)
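
The select with a default case above is Go's non-blocking receive: each pass through the loop peeks at stop and falls straight through when no signal is pending, replacing the old version's nil-channel guard. A minimal, self-contained sketch of the same pattern (all names here are illustrative, not from this repo):

package main

import (
	"fmt"
	"time"
)

func main() {
	// stop is closed by another goroutine to request shutdown.
	stop := make(chan struct{})
	go func() {
		time.Sleep(50 * time.Millisecond)
		close(stop)
	}()

loop:
	for i := 0; ; i++ {
		select {
		case <-stop:
			break loop // a pending stop ends the loop
		default:
			// nothing pending; do this iteration's work
		}
		fmt.Println("iteration", i)
		time.Sleep(10 * time.Millisecond)
	}
	fmt.Println("stopped")
}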
@@ -84,9 +82,16 @@ mainLoop:
 			return util.Wrap("commit crawl loop transaction", err)
 		}
-		jobs <- site
+		select {
+		case jobs <- site:
+		case <-stop:
+			break mainLoop
+		}
 	}
 
+	slog.Info("Gracefully shutting down")
 	return nil
 }
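
Wrapping the send in a select fixes a shutdown hazard: a bare jobs <- site blocks until a worker receives, so a stop signal arriving while every worker was busy would previously sit unnoticed until the next hand-off completed. With two cases, whichever channel becomes ready first wins. A hedged, standalone sketch of that hand-off (names are illustrative):

package main

import (
	"fmt"
	"time"
)

// dispatch hands each job to a worker, but gives up as soon as a
// stop arrives while no worker is free to receive.
func dispatch(jobs chan<- string, stop <-chan struct{}, work []string) {
	for _, job := range work {
		select {
		case jobs <- job: // a worker took the job
		case <-stop: // told to quit mid-hand-off
			fmt.Println("dropping", job, "and shutting down")
			return
		}
	}
}

func main() {
	jobs := make(chan string)
	stop := make(chan struct{})

	go func() {
		fmt.Println("working on", <-jobs) // accept one job only
		time.Sleep(20 * time.Millisecond)
		close(stop) // then request shutdown
	}()

	dispatch(jobs, stop, []string{"a.example", "b.example"})
}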
@@ -111,7 +116,7 @@ func (c *CrawlCore) worker(workerID int, jobChan chan *database.Site) {
 		currPageNumber := 0
 
 		if site.StartURL == "" {
-			site.StartURL = "http://" + site.Domain + "/"
+			site.StartURL = "https://" + site.Domain + "/"
 		}
 
 		queuedURLs := map[string]struct{}{
@@ -135,10 +140,11 @@ func (c *CrawlCore) worker(workerID int, jobChan chan *database.Site) {
 			urlQueue = slices.Delete(urlQueue, 0, 1)
 		}
 
-		log.Info("get page", "n", currPageNumber, "url", currentURL)
+		//log.Info("get page", "n", currPageNumber, "url", currentURL)
 
 		// Get page
 		var pageBody string
+		headers := make(map[string][]string)
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
 		err := requests.URL(currentURL).ToString(&pageBody).UserAgent(conf.UserAgent).AddValidator(func(r *http.Response) error {
@@ -150,7 +156,7 @@ func (c *CrawlCore) worker(workerID int, jobChan chan *database.Site) {
 				return continuePageLoop
 			}
 			return nil
-		}).Fetch(ctx)
+		}).CopyHeaders(headers).Fetch(ctx)
 		cancel()
 		if err != nil {
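
The CopyHeaders step makes the fetch record the response headers into the headers map that the Content-Type check below reads. Assuming plain net/http rather than the requests builder, the equivalent copying looks roughly like this (the URL is a placeholder):

package main

import (
	"fmt"
	"net/http"
)

func main() {
	headers := make(map[string][]string)

	resp, err := http.Get("https://example.com/") // placeholder URL
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	defer resp.Body.Close()

	// Copy every response header into the map, which is what the
	// builder's CopyHeaders step does as part of the fetch.
	for k, v := range resp.Header {
		headers[k] = v
	}

	fmt.Println(headers["Content-Type"])
}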
@@ -163,6 +169,13 @@ func (c *CrawlCore) worker(workerID int, jobChan chan *database.Site) {
 			break pageLoop
 		}
 
+		{
+			x := headers["Content-Type"]
+			if len(x) < 1 || !strings.HasPrefix(strings.ToLower(x[0]), "text/html") {
+				continue pageLoop
+			}
+		}
+
 		// Extract links
 		doc, err := goquery.NewDocumentFromReader(bytes.NewReader([]byte(pageBody)))
 		if err != nil {
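
The new guard only lets responses whose first Content-Type value starts with text/html reach the goquery parser, so JSON, images, and other non-HTML bodies are skipped; matching just the prefix tolerates parameters such as "; charset=utf-8". A small sketch of the same check against net/http's header type (the helper name is invented for illustration):

package main

import (
	"fmt"
	"net/http"
	"strings"
)

// isHTML reports whether a response header advertises an HTML body.
// Only the prefix is matched so "text/html; charset=utf-8" passes.
func isHTML(h http.Header) bool {
	ct := h.Values("Content-Type")
	return len(ct) > 0 && strings.HasPrefix(strings.ToLower(ct[0]), "text/html")
}

func main() {
	h := http.Header{}
	h.Set("Content-Type", "text/html; charset=utf-8")
	fmt.Println(isHTML(h)) // true

	h.Set("Content-Type", "application/json")
	fmt.Println(isHTML(h)) // false
}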

View file

@@ -8,6 +8,8 @@ import (
 	"git.tdpain.net/codemicro/hn84/util"
 	"log/slog"
 	"os"
+	"os/signal"
+	"syscall"
 )
 
 func main() {
@@ -41,7 +43,9 @@ func run() error {
 			return util.Wrap("add site", err)
 		}
 	case "run":
-		if err := cc.Loop(nil); err != nil {
+		ch := make(chan os.Signal)
+		signal.Notify(ch, syscall.SIGINT)
+		if err := cc.Loop(ch); err != nil {
 			return util.Wrap("run crawl loop", err)
 		}
 	default:
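
One note on the pattern above: signal.Notify never blocks when delivering to the channel, so the os/signal documentation recommends a buffered channel, and go vet warns about the unbuffered one that make(chan os.Signal) creates. A sketch of the conventional setup; listening for SIGTERM as well is an assumption here, not something this commit does:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	// Buffered so a signal arriving while we're busy isn't lost;
	// signal.Notify does not block sending to this channel.
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)

	fmt.Println("waiting for Ctrl-C...")
	sig := <-ch
	fmt.Println("got", sig, "- shutting down")
}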