Merge pull request #298 from jakopako:jakopako/issue255
better debugging files/logs + allow LIST of interactions
jakopako authored on May 19, 2024 · 2 parents 61fd416 + b78231a · commit 480b3f2
Showing 5 changed files with 96 additions and 56 deletions.
5 changes: 5 additions & 0 deletions config/config.go
@@ -0,0 +1,5 @@
package config

var (
Debug bool
)
80 changes: 51 additions & 29 deletions fetch/fetcher.go
@@ -6,16 +6,20 @@ import (
"io"
"log/slog"
"net/http"
"net/url"
"os"
"time"

"github.com/chromedp/cdproto/cdp"
"github.com/chromedp/cdproto/dom"
"github.com/chromedp/chromedp"
"github.com/jakopako/goskyr/config"
"github.com/jakopako/goskyr/types"
"github.com/jakopako/goskyr/utils"
)

type FetchOpts struct {
Interaction types.Interaction
Interaction []*types.Interaction
}

// A Fetcher allows to fetch the content of a web page
@@ -90,8 +94,8 @@ func (d *DynamicFetcher) Cancel() {
d.cancelAlloc()
}

func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", url))
func (d *DynamicFetcher) Fetch(urlStr string, opts FetchOpts) (string, error) {
logger := slog.With(slog.String("fetcher", "dynamic"), slog.String("url", urlStr))
logger.Debug("fetching page", slog.String("user-agent", d.UserAgent))
// start := time.Now()
ctx, cancel := chromedp.NewContext(d.allocContext)
@@ -104,36 +108,37 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
var body string
sleepTime := time.Duration(d.WaitMilliseconds) * time.Millisecond
actions := []chromedp.Action{
chromedp.Navigate(url),
chromedp.Navigate(urlStr),
chromedp.Sleep(sleepTime),
}
logger.Debug(fmt.Sprintf("appended chrome actions: Navigate, Sleep(%v)", sleepTime))
delay := 500 * time.Millisecond // default is .5 seconds
if opts.Interaction.Delay > 0 {
delay = time.Duration(opts.Interaction.Delay) * time.Millisecond
}
if opts.Interaction.Type == types.InteractionTypeClick {
count := 1 // default is 1
if opts.Interaction.Count > 0 {
count = opts.Interaction.Count
for j, ia := range opts.Interaction {
logger.Debug(fmt.Sprintf("processing interaction nr %d, type %s", j, ia.Type))
delay := 500 * time.Millisecond // default is .5 seconds
if ia.Delay > 0 {
delay = time.Duration(ia.Delay) * time.Millisecond
}
for i := 0; i < count; i++ {
// we only click the button if it exists. Do we really need this check here?
// TODO: should we click as many times as possible if count == 0? How would we implement this?
// actions = append(actions, chromedp.Click(d.Interaction.Selector, chromedp.ByQuery))
actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
var nodes []*cdp.Node
if err := chromedp.Nodes(opts.Interaction.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
return err
}
if len(nodes) == 0 {
return nil
} // nothing to do
logger.Debug(fmt.Sprintf("clicking on node with selector: %s", opts.Interaction.Selector))
return chromedp.MouseClickNode(nodes[0]).Do(ctx)
}))
actions = append(actions, chromedp.Sleep(delay))
logger.Debug(fmt.Sprintf("appended chrome actions: ActionFunc, Sleep(%v)", delay))
if ia.Type == types.InteractionTypeClick {
count := 1 // default is 1
if ia.Count > 0 {
count = ia.Count
}
for i := 0; i < count; i++ {
// we only click the button if it exists. Do we really need this check here?
actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
var nodes []*cdp.Node
if err := chromedp.Nodes(ia.Selector, &nodes, chromedp.AtLeast(0)).Do(ctx); err != nil {
return err
}
if len(nodes) == 0 {
return nil
} // nothing to do
logger.Debug(fmt.Sprintf("clicking on node with selector: %s", ia.Selector))
return chromedp.MouseClickNode(nodes[0]).Do(ctx)
}))
actions = append(actions, chromedp.Sleep(delay))
logger.Debug(fmt.Sprintf("appended chrome actions: ActionFunc (mouse click), Sleep(%v)", delay))
}
}
}
actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
@@ -145,6 +150,23 @@ func (d *DynamicFetcher) Fetch(url string, opts FetchOpts) (string, error) {
return err
}))

if config.Debug {
u, _ := url.Parse(urlStr)
var buf []byte
r, err := utils.RandomString(u.Host)
if err != nil {
return "", err
}
filename := fmt.Sprintf("%s.png", r)
actions = append(actions, chromedp.CaptureScreenshot(&buf))
actions = append(actions, chromedp.ActionFunc(func(ctx context.Context) error {
// log.Printf("Write %v", fileName)
logger.Debug(fmt.Sprintf("writing screenshot to file %s", filename))
return os.WriteFile(filename, buf, 0644)
}))
logger.Debug("appended chrome actions: CaptureScreenshot, ActionFunc (save screenshot)")
}

// run task list
err := chromedp.Run(ctx,
actions...,
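
With this change FetchOpts carries a list of interactions, so several scripted steps can run in order before the page HTML is read. Below is a minimal caller sketch; it assumes fetch.Fetcher is the interface consumed by scraper.go, and the selectors, counts and delays are purely illustrative:

```go
package main

import (
	"fmt"
	"log"

	"github.com/jakopako/goskyr/fetch"
	"github.com/jakopako/goskyr/types"
)

// fetchWithClicks sketches the new FetchOpts shape: Interaction is now a slice,
// so several scripted interactions run in order before the page HTML is returned.
func fetchWithClicks(f fetch.Fetcher, pageURL string) (string, error) {
	opts := fetch.FetchOpts{
		Interaction: []*types.Interaction{
			// selectors, counts and delays are illustrative only
			{Type: types.InteractionTypeClick, Selector: "#accept-cookies"},
			{Type: types.InteractionTypeClick, Selector: "button.load-more", Count: 3, Delay: 800},
		},
	}
	return f.Fetch(pageURL, opts)
}

func main() {
	var f fetch.Fetcher // plug in a concrete fetcher here, e.g. a dynamic (chromedp-based) one
	if f == nil {
		fmt.Println("no fetcher configured; this sketch only shows the FetchOpts shape")
		return
	}
	html, err := fetchWithClicks(f, "https://www.example.com/events")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("fetched %d bytes of HTML\n", len(html))
}
```
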
4 changes: 3 additions & 1 deletion main.go
@@ -10,6 +10,7 @@ import (
"sync"

"github.com/jakopako/goskyr/autoconfig"
"github.com/jakopako/goskyr/config"
"github.com/jakopako/goskyr/ml"
"github.com/jakopako/goskyr/output"
"github.com/jakopako/goskyr/scraper"
@@ -65,6 +66,7 @@ func main() {
return
}

config.Debug = *debugFlag
var logLevel slog.Level
if *debugFlag {
logLevel = slog.LevelDebug
@@ -170,7 +172,7 @@ func main() {
go func() {
for _, s := range config.Scrapers {
if *singleScraper == "" || *singleScraper == s.Name {
s.Debug = *debugFlag
// s.Debug = *debugFlag
sc <- s
}
}
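
On the main.go side, the per-scraper Debug field gives way to the package-level config.Debug, set once from the CLI flag. A minimal sketch of that wiring (the flag name "debug" and the log handler setup are assumptions; the real main.go does considerably more):

```go
package main

import (
	"flag"
	"log/slog"
	"os"

	"github.com/jakopako/goskyr/config"
)

func main() {
	// flag name assumed; goskyr's main.go defines its own flag set
	debugFlag := flag.Bool("debug", false, "write debug logs plus HTML and screenshot dumps")
	flag.Parse()

	// the global flag is what fetch.Fetch and scraper.fetchToDoc check before dumping files
	config.Debug = *debugFlag

	logLevel := slog.LevelInfo
	if *debugFlag {
		logLevel = slog.LevelDebug
	}
	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: logLevel})))
	slog.Debug("debug mode enabled") // only printed when the debug flag is set
}
```
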
53 changes: 27 additions & 26 deletions scraper/scraper.go
@@ -2,7 +2,6 @@ package scraper

import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io/fs"
@@ -19,6 +18,7 @@ import (
"github.com/antchfx/jsonquery"
"github.com/goodsign/monday"
"github.com/ilyakaznacheev/cleanenv"
"github.com/jakopako/goskyr/config"
"github.com/jakopako/goskyr/date"
"github.com/jakopako/goskyr/fetch"
"github.com/jakopako/goskyr/output"
@@ -236,17 +236,16 @@ type Paginator struct {
// A Scraper contains all the necessary config parameters and structs needed
// to extract the desired information from a website
type Scraper struct {
Name string `yaml:"name"`
URL string `yaml:"url"`
Item string `yaml:"item"`
Fields []Field `yaml:"fields,omitempty"`
Filters []*Filter `yaml:"filters,omitempty"`
Paginator Paginator `yaml:"paginator,omitempty"`
RenderJs bool `yaml:"render_js,omitempty"`
PageLoadWait int `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
Interaction types.Interaction `yaml:"interaction,omitempty"`
Name string `yaml:"name"`
URL string `yaml:"url"`
Item string `yaml:"item"`
Fields []Field `yaml:"fields,omitempty"`
Filters []*Filter `yaml:"filters,omitempty"`
Paginator Paginator `yaml:"paginator,omitempty"`
RenderJs bool `yaml:"render_js,omitempty"`
PageLoadWait int `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
Interaction []*types.Interaction `yaml:"interaction,omitempty"`
fetcher fetch.Fetcher
Debug bool `yaml:"debug,omitempty"`
}

// GetItems fetches and returns all items from a website according to the
@@ -280,7 +279,7 @@ func (c Scraper) GetItems(globalConfig *GlobalConfig, rawDyn bool) ([]map[string
currentPage := 0
var doc *goquery.Document

hasNextPage, pageURL, doc, err := c.fetchPage(nil, currentPage, c.URL, globalConfig.UserAgent, &c.Interaction)
hasNextPage, pageURL, doc, err := c.fetchPage(nil, currentPage, c.URL, globalConfig.UserAgent, c.Interaction)
if err != nil {
return items, err
}
@@ -477,10 +476,10 @@ func (c *Scraper) removeHiddenFields(item map[string]interface{}) map[string]int
return item
}

func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i *types.Interaction) (bool, string, *goquery.Document, error) {
func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl, userAgent string, i []*types.Interaction) (bool, string, *goquery.Document, error) {

if nextPageI == 0 {
newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: *i})
newDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: i})
if err != nil {
return false, "", nil, err
}
@@ -492,10 +491,12 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
pagSelector := doc.Find(c.Paginator.Location.Selector)
if len(pagSelector.Nodes) > 0 {
if nextPageI < c.Paginator.MaxPages || c.Paginator.MaxPages == 0 {
ia := types.Interaction{
Selector: c.Paginator.Location.Selector,
Type: types.InteractionTypeClick,
Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
ia := []*types.Interaction{
{
Selector: c.Paginator.Location.Selector,
Type: types.InteractionTypeClick,
Count: nextPageI, // we always need to 'restart' the clicks because we always re-fetch the page
},
}
nextPageDoc, err := c.fetchToDoc(currentPageUrl, fetch.FetchOpts{Interaction: ia})
if err != nil {
@@ -525,8 +526,8 @@ func (c *Scraper) fetchPage(doc *goquery.Document, nextPageI int, currentPageUrl
}
}

func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Document, error) {
res, err := c.fetcher.Fetch(url, opts)
func (c *Scraper) fetchToDoc(urlStr string, opts fetch.FetchOpts) (*goquery.Document, error) {
res, err := c.fetcher.Fetch(urlStr, opts)
if err != nil {
return nil, err
}
@@ -537,14 +538,14 @@ func (c *Scraper) fetchToDoc(url string, opts fetch.FetchOpts) (*goquery.Documen
}

// in debug mode we want to write all the html's to files
if c.Debug {
bs := make([]byte, 8)
_, err := rand.Read(bs)
if config.Debug {
u, _ := url.Parse(urlStr)
r, err := utils.RandomString(u.Host)
if err != nil {
return nil, fmt.Errorf("failed to generate random bytes for html file name")
return nil, err
}
filename := fmt.Sprintf("%s-%x.html", c.Name, bs[:8])
slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", url))
filename := fmt.Sprintf("%s.html", r)
slog.Debug(fmt.Sprintf("writing html to file %s", filename), slog.String("url", urlStr))
htmlStr, err := goquery.OuterHtml(doc.Children())
if err != nil {
return nil, fmt.Errorf("failed to write html file: %v", err)
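
Because Scraper.Interaction is now a slice, one scraper can declare several interactions. A sketch of the equivalent Go struct literal follows; all values are illustrative, and in goskyr the struct is normally populated from the YAML config rather than built by hand:

```go
package main

import (
	"fmt"

	"github.com/jakopako/goskyr/scraper"
	"github.com/jakopako/goskyr/types"
)

func main() {
	// Illustrative values; the `interaction` key in the YAML config now accepts a list.
	s := scraper.Scraper{
		Name:         "concerts",
		URL:          "https://www.example.com/concerts",
		Item:         "div.event",
		RenderJs:     true,
		PageLoadWait: 1000,
		Interaction: []*types.Interaction{
			{Type: types.InteractionTypeClick, Selector: "#accept-cookies"},
		},
	}
	fmt.Printf("%s scrapes %s with %d scripted interaction(s)\n", s.Name, s.URL, len(s.Interaction))
}
```
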
10 changes: 10 additions & 0 deletions utils/utils.go
@@ -1,6 +1,7 @@
package utils

import (
"crypto/rand"
"fmt"
"math"
"sort"
@@ -142,3 +143,12 @@ func ReverseSlice[T any](s []T) {
s[i], s[j] = s[j], s[i]
}
}

func RandomString(base string) (string, error) {
bs := make([]byte, 8)
_, err := rand.Read(bs)
if err != nil {
return "", fmt.Errorf("failed to generate random bytes: %v", err)
}
return fmt.Sprintf("%s-%x", base, bs[:8]), nil
}
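
RandomString centralizes the random-suffix naming that scraper.go previously did inline with crypto/rand; fetcher.go and scraper.go now derive debug file names from the page host plus this suffix. A short usage sketch with illustrative URL and output:

```go
package main

import (
	"fmt"
	"log"
	"net/url"

	"github.com/jakopako/goskyr/utils"
)

func main() {
	u, err := url.Parse("https://concerts.example.com/agenda")
	if err != nil {
		log.Fatal(err)
	}
	// Same pattern as in fetchToDoc and Fetch above: host plus 8 random bytes in hex.
	name, err := utils.RandomString(u.Host)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(name + ".html") // e.g. concerts.example.com-1a2b3c4d5e6f7089.html
	fmt.Println(name + ".png")  // screenshots in fetcher.go use the same scheme with a .png extension
}
```
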
