From d4b8853a2b03e924abce4b29ee0224f9d402d925 Mon Sep 17 00:00:00 2001
From: jakopako
Date: Sun, 6 Mar 2022 05:30:33 +0100
Subject: [PATCH] fixed some smaller issues

---
 go.mod             |   6 ++-
 go.sum             |  15 ++++++
 main.go            |  12 ++---
 output/api.go      |  17 ++++---
 output/stdout.go   |   5 +-
 scraper/scraper.go | 116 ++++++++++++++++++++++++++------------------
 6 files changed, 105 insertions(+), 66 deletions(-)

diff --git a/go.mod b/go.mod
index af7eaa7..f52c8b8 100644
--- a/go.mod
+++ b/go.mod
@@ -9,4 +9,8 @@ require (
 	gopkg.in/yaml.v2 v2.4.0
 )
 
-require github.com/andybalholm/cascadia v1.3.1 // indirect
+require (
+	github.com/andybalholm/cascadia v1.3.1 // indirect
+	golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 // indirect
+	golang.org/x/tools v0.1.9 // indirect
+)
diff --git a/go.sum b/go.sum
index 00bb964..3701c49 100644
--- a/go.sum
+++ b/go.sum
@@ -4,18 +4,33 @@ github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x0
 github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
 github.com/goodsign/monday v1.0.0 h1:Yyk/s/WgudMbAJN6UWSU5xAs8jtNewfqtVblAlw0yoc=
 github.com/goodsign/monday v1.0.0/go.mod h1:r4T4breXpoFwspQNM+u2sLxJb2zyTaxVGqUfTBjWOu8=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 h1:VLliZ0d+/avPrXXH+OakdXhpJuEoBZuwh1m2j7U6Iug=
+golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
+golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/net v0.0.0-20220225172249-27dd8689420f h1:oA4XRj0qtSt8Yo1Zms0CUlsT3KG69V2UGQWPBxujDmc=
 golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
+golang.org/x/tools v0.1.9 h1:j9KsMiaP1c3B0OTQGth0/k+miLGTgLsAFUCrF2vLcF8=
+golang.org/x/tools v0.1.9/go.mod h1:nABZi5QlRsZVlzPpHl034qft6wpY4eDcsTt5AaioBiU=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
diff --git a/main.go b/main.go
index 5d2c01e..1e8b0b0 100644
--- a/main.go
+++ b/main.go
@@ -11,7 +11,7 @@ import (
 	"gopkg.in/yaml.v2"
 )
 
-func NewConfig(configPath string) (*scraper.Config, error) {
+func newConfig(configPath string) (*scraper.Config, error) {
 	config := &scraper.Config{}
 	file, err := os.Open(configPath)
 	if err != nil {
@@ -32,7 +32,7 @@ func main() {
 
 	flag.Parse()
 
-	config, err := NewConfig(*configFile)
+	config, err := newConfig(*configFile)
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -44,18 +44,18 @@ func main() {
 			if *singleScraper == s.Name {
 				wg.Add(1)
 				if *storeData {
-					output.WriteEventsToAPI(&wg, s)
+					output.WriteItemsToAPI(&wg, s)
 				} else {
-					output.PrettyPrintEvents(&wg, s)
+					output.PrettyPrintItems(&wg, s)
 				}
 				break
 			}
 		} else {
 			wg.Add(1)
 			if *storeData {
-				go output.WriteEventsToAPI(&wg, s)
+				go output.WriteItemsToAPI(&wg, s)
 			} else {
-				go output.PrettyPrintEvents(&wg, s)
+				go output.PrettyPrintItems(&wg, s)
 			}
 		}
 	}
diff --git a/output/api.go b/output/api.go
index 0440fe3..59f5040 100644
--- a/output/api.go
+++ b/output/api.go
@@ -15,16 +15,19 @@ import (
 	"github.com/jakopako/goskyr/scraper"
 )
 
-func WriteEventsToAPI(wg *sync.WaitGroup, c scraper.Scraper) {
+// WriteItemsToAPI writes the scraped events to an API defined through
+// env vars. This function is of limited general use because it is
+// tailored to one specific API; it might change in the future.
+func WriteItemsToAPI(wg *sync.WaitGroup, c scraper.Scraper) {
 	// This function is not yet documented in the README because it might soon change and the entire result / output handling
 	// might be refactored / improved.
 	log.Printf("crawling %s\n", c.Name)
 	defer wg.Done()
-	apiUrl := os.Getenv("EVENT_API")
+	apiURL := os.Getenv("EVENT_API")
 	client := &http.Client{
 		Timeout: time.Second * 10,
 	}
-	events, err := c.GetEvents()
+	events, err := c.GetItems()
 	if err != nil {
 		log.Printf("%s ERROR: %s", c.Name, err)
@@ -40,8 +43,8 @@ func WriteEventsToAPI(wg *sync.WaitGroup, c scraper.Scraper) {
 
 	// delete events of this scraper from first date on
 	firstDate := events[0]["date"].(time.Time).UTC().Format("2006-01-02 15:04")
-	deleteUrl := fmt.Sprintf("%s?location=%s&datetime=%s", apiUrl, url.QueryEscape(c.Name), url.QueryEscape(firstDate))
-	req, _ := http.NewRequest("DELETE", deleteUrl, nil)
+	deleteURL := fmt.Sprintf("%s?location=%s&datetime=%s", apiURL, url.QueryEscape(c.Name), url.QueryEscape(firstDate))
+	req, _ := http.NewRequest("DELETE", deleteURL, nil)
 	req.SetBasicAuth(os.Getenv("API_USER"), os.Getenv("API_PASSWORD"))
 	resp, err := client.Do(req)
 	if err != nil {
@@ -53,7 +56,7 @@ func WriteEventsToAPI(wg *sync.WaitGroup, c scraper.Scraper) {
 		if err != nil {
 			log.Fatal(err)
 		}
-		log.Fatalf("Something went wrong while deleting events. Status Code: %d\nUrl: %s Response: %s", resp.StatusCode, deleteUrl, body)
+		log.Fatalf("Something went wrong while deleting events. Status Code: %d\nUrl: %s Response: %s", resp.StatusCode, deleteURL, body)
 	}
 
 	// add new events
@@ -62,7 +65,7 @@ func WriteEventsToAPI(wg *sync.WaitGroup, c scraper.Scraper) {
 		if err != nil {
 			log.Fatal(err)
 		}
-		req, _ := http.NewRequest("POST", apiUrl, bytes.NewBuffer(concertJSON))
+		req, _ := http.NewRequest("POST", apiURL, bytes.NewBuffer(concertJSON))
 		req.Header = map[string][]string{
 			"Content-Type": {"application/json"},
 		}
diff --git a/output/stdout.go b/output/stdout.go
index aaa4303..77f0f4f 100644
--- a/output/stdout.go
+++ b/output/stdout.go
@@ -10,9 +10,10 @@ import (
 	"github.com/jakopako/goskyr/scraper"
 )
-func PrettyPrintEvents(wg *sync.WaitGroup, c scraper.Scraper) {
+// PrettyPrintItems pretty-prints the scraped items.
+func PrettyPrintItems(wg *sync.WaitGroup, c scraper.Scraper) {
 	defer wg.Done()
-	events, err := c.GetEvents()
+	events, err := c.GetItems()
 	if err != nil {
 		log.Printf("%s ERROR: %s", c.Name, err)
 		return
 	}
diff --git a/scraper/scraper.go b/scraper/scraper.go
index 1dda8c8..1ddd50d 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -15,15 +15,20 @@ import (
 	"golang.org/x/net/html"
 )
 
+// Config defines the overall structure of a scraper, which is also the
+// structure of the configuration file for the scraper.
 type Config struct {
 	Scrapers []Scraper `yaml:"scrapers"`
 }
 
+// RegexConfig is used for extracting a substring from a string based on the
+// given Exp and Index.
 type RegexConfig struct {
 	Exp   string `yaml:"exp"`
 	Index int    `yaml:"index"`
 }
 
+// ElementLocation is used to find a specific string in an HTML document.
 type ElementLocation struct {
 	Selector  string `yaml:"selector"`
 	NodeIndex int    `yaml:"node_index"`
@@ -33,6 +38,8 @@ type ElementLocation struct {
 	MaxLength int    `yaml:"max_length"`
 }
 
+// CoveredDateParts is used to determine what parts of a date a
+// DateComponent covers.
 type CoveredDateParts struct {
 	Day   bool `yaml:"day"`
 	Month bool `yaml:"month"`
@@ -40,17 +47,24 @@ type CoveredDateParts struct {
 	Time bool `yaml:"time"`
 }
 
+// A DateComponent is used to find a specific part of a date within
+// an HTML document.
 type DateComponent struct {
 	Covers          CoveredDateParts `yaml:"covers"`
 	ElementLocation ElementLocation  `yaml:"location"`
 	Layout          string           `yaml:"layout"`
 }
 
+// A StaticField defines a field that has a fixed name and value
+// across all scraped items.
 type StaticField struct {
 	Name  string `yaml:"name"`
 	Value string `yaml:"value"`
 }
 
+// A DynamicField contains all the information necessary to scrape
+// a dynamic field from a website, i.e. a field whose value changes
+// for each item.
 type DynamicField struct {
 	Name string `yaml:"name"`
 	Type string `yaml:"type"` // can currently be text, url or date
@@ -65,11 +79,14 @@ type DynamicField struct {
 	Relative bool `yaml:"relative"` // applies to url
 }
 
+// A Filter is used to filter certain items from the result list.
 type Filter struct {
 	Field       string `yaml:"field"`
 	RegexIgnore string `yaml:"regex_ignore"`
 }
 
+// A Scraper contains all the necessary config parameters and structs needed
+// to extract the desired information from a website.
 type Scraper struct {
 	Name string `yaml:"name"`
 	URL  string `yaml:"url"`
@@ -88,29 +105,30 @@ type Scraper struct {
 	}
 }
 
-func (c Scraper) GetEvents() ([]map[string]interface{}, error) {
+// GetItems fetches and returns all items from a website according to the
+// Scraper's parameters.
+func (c Scraper) GetItems() ([]map[string]interface{}, error) {
-	var events []map[string]interface{}
+	var items []map[string]interface{}
 
-	pageUrl := c.URL
+	pageURL := c.URL
 	hasNextPage := true
 	currentPage := 0
 	for hasNextPage {
-		res, err := http.Get(pageUrl)
+		res, err := http.Get(pageURL)
 		if err != nil {
-			return events, err
+			return items, err
 		}
 		// defer res.Body.Close() // better not defer in a for loop
 
 		if res.StatusCode != 200 {
-			errMsg := fmt.Sprintf("status code error: %d %s", res.StatusCode, res.Status)
-			return events, errors.New(errMsg)
+			return items, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
 		}
 
 		doc, err := goquery.NewDocumentFromReader(res.Body)
 		if err != nil {
-			return events, err
+			return items, err
 		}
 
 		doc.Find(c.Item).Each(func(i int, s *goquery.Selection) {
@@ -121,17 +139,17 @@ func (c Scraper) GetEvents() ([]map[string]interface{}, error) {
 			}
 
 			// add static fields
-			currentEvent := make(map[string]interface{})
+			currentItem := make(map[string]interface{})
 			for _, sf := range c.Fields.Static {
-				currentEvent[sf.Name] = sf.Value
+				currentItem[sf.Name] = sf.Value
 			}
 
 			// handle all fields on the main page
 			for _, f := range c.Fields.Dynamic {
 				if f.OnSubpage == "" {
-					err := extractField(&f, currentEvent, s, c.URL, res)
+					err := extractField(&f, currentItem, s, c.URL, res)
 					if err != nil {
-						log.Printf("%s ERROR: error while parsing field %s: %v. Skipping event %v.", c.Name, f.Name, err, currentEvent)
+						log.Printf("%s ERROR: error while parsing field %s: %v. Skipping item %v.", c.Name, f.Name, err, currentItem)
 						return
 					}
 				}
@@ -148,30 +166,30 @@ func (c Scraper) GetEvents() ([]map[string]interface{}, error) {
 			for _, f := range c.Fields.Dynamic {
 				if f.OnSubpage != "" {
 					// check whether we fetched the page already
-					subpageUrl := fmt.Sprint(currentEvent[f.OnSubpage])
-					resSub, found := subpagesResp[subpageUrl]
+					subpageURL := fmt.Sprint(currentItem[f.OnSubpage])
+					resSub, found := subpagesResp[subpageURL]
 					if !found {
-						resSub, err = http.Get(subpageUrl)
+						resSub, err = http.Get(subpageURL)
 						if err != nil {
-							log.Printf("%s ERROR: %v. Skipping event %v.", c.Name, err, currentEvent)
+							log.Printf("%s ERROR: %v. Skipping item %v.", c.Name, err, currentItem)
 							return
 						}
 						if resSub.StatusCode != 200 {
-							log.Printf("%s ERROR: status code error: %d %s. Skipping event %v.", c.Name, res.StatusCode, res.Status, currentEvent)
+							log.Printf("%s ERROR: status code error: %d %s. Skipping item %v.", c.Name, resSub.StatusCode, resSub.Status, currentItem)
 							return
 						}
-						subpagesResp[subpageUrl] = resSub
+						subpagesResp[subpageURL] = resSub
 						docSub, err := goquery.NewDocumentFromReader(resSub.Body)
 						if err != nil {
-							log.Printf("%s ERROR: error while reading document: %v. Skipping event %v", c.Name, err, currentEvent)
+							log.Printf("%s ERROR: error while reading document: %v. Skipping item %v", c.Name, err, currentItem)
 							return
 						}
-						subpagesBody[subpageUrl] = docSub
+						subpagesBody[subpageURL] = docSub
 					}
-					err = extractField(&f, currentEvent, subpagesBody[subpageUrl].Selection, c.URL, resSub)
+					err = extractField(&f, currentItem, subpagesBody[subpageURL].Selection, c.URL, resSub)
 					if err != nil {
-						log.Printf("%s ERROR: error while parsing field %s: %v. Skipping event %v.", c.Name, f.Name, err, currentEvent)
+						log.Printf("%s ERROR: error while parsing field %s: %v. Skipping item %v.", c.Name, f.Name, err, currentItem)
 						return
 					}
 				}
@@ -182,36 +200,36 @@ func (c Scraper) GetEvents() ([]map[string]interface{}, error) {
 			}
 
 			// check if event should be ignored
-			ie, err := c.ignoreEvent(currentEvent)
+			ie, err := c.ignoreItem(currentItem)
 			if err != nil {
-				log.Fatalf("%s ERROR: error while applying ignore filter: %v. Not ignoring event %v.", c.Name, err, currentEvent)
+				log.Fatalf("%s ERROR: error while applying ignore filter: %v. Not ignoring item %v.", c.Name, err, currentItem)
 			}
 			if !ie {
-				events = append(events, currentEvent)
+				items = append(items, currentItem)
 			}
 		})
 
 		hasNextPage = false
 		if c.Paginator.Selector != "" {
-			currentPage += 1
+			currentPage++
 			if currentPage < c.Paginator.MaxPages || c.Paginator.MaxPages == 0 {
 				attr := "href"
 				if len(doc.Find(c.Paginator.Selector).Nodes) > c.Paginator.NodeIndex {
 					pagNode := doc.Find(c.Paginator.Selector).Get(c.Paginator.NodeIndex)
 					for _, a := range pagNode.Attr {
 						if a.Key == attr {
-							nextUrl := a.Val
+							nextURL := a.Val
 							if c.Paginator.Relative {
 								baseURL := fmt.Sprintf("%s://%s", res.Request.URL.Scheme, res.Request.URL.Host)
-								if strings.HasPrefix(nextUrl, "?") {
-									pageUrl = baseURL + res.Request.URL.Path + nextUrl
-								} else if !strings.HasPrefix(nextUrl, "/") {
-									pageUrl = baseURL + "/" + nextUrl
+								if strings.HasPrefix(nextURL, "?") {
+									pageURL = baseURL + res.Request.URL.Path + nextURL
+								} else if !strings.HasPrefix(nextURL, "/") {
+									pageURL = baseURL + "/" + nextURL
 								} else {
-									pageUrl = baseURL + nextUrl
+									pageURL = baseURL + nextURL
 								}
 							} else {
-								pageUrl = nextUrl
+								pageURL = nextURL
 							}
 							hasNextPage = true
 						}
@@ -227,10 +245,10 @@ func (c Scraper) GetEvents() ([]map[string]interface{}, error) {
 	// year of some events because our previous guess was rather naiv. We also might want
 	// to make this functionality optional. See issue #68
 
-	return events, nil
+	return items, nil
 }
 
-func (c Scraper) ignoreEvent(event map[string]interface{}) (bool, error) {
+func (c Scraper) ignoreItem(event map[string]interface{}) (bool, error) {
 	for _, filter := range c.Filters {
 		regex, err := regexp.Compile(filter.RegexIgnore)
 		if err != nil {
@@ -247,7 +265,7 @@ func (c Scraper) ignoreEvent(event map[string]interface{}) (bool, error) {
 	return false, nil
 }
 
-func extractField(field *DynamicField, event map[string]interface{}, s *goquery.Selection, baseUrl string, res *http.Response) error {
+func extractField(field *DynamicField, event map[string]interface{}, s *goquery.Selection, baseURL string, res *http.Response) error {
 	switch field.Type {
 	case "text", "": // the default, ie when type is not configured, is 'text'
 		ts, err := getTextString(&field.ElementLocation, s)
@@ -256,13 +274,12 @@ func extractField(field *DynamicField, event map[string]interface{}, s *goquery.Selection, baseUrl string, res *http.Response) error {
 		}
 		if !field.CanBeEmpty {
 			if ts == "" {
-				errMsg := fmt.Sprintf("field %s cannot be empty", field.Name)
-				return errors.New(errMsg)
+				return fmt.Errorf("field %s cannot be empty", field.Name)
 			}
 		}
 		event[field.Name] = ts
 	case "url":
-		event[field.Name] = getUrlString(field, s, baseUrl, res)
+		event[field.Name] = getURLString(field, s, baseURL, res)
 	case "date":
 		d, err := getDate(field, s)
 		if err != nil {
@@ -270,13 +287,12 @@ func extractField(field *DynamicField, event map[string]interface{}, s *goquery.Selection, baseUrl string, res *http.Response) error {
 		}
 		event[field.Name] = d
 	default:
-		errMsg := fmt.Sprintf("field type '%s' does not exist", field.Type)
-		return errors.New(errMsg)
+		return fmt.Errorf("field type '%s' does not exist", field.Type)
 	}
 	return nil
 }
 
-type DatePart struct {
+type datePart struct {
 	stringPart string
 	layoutPart string
 }
@@ -296,7 +312,7 @@ func getDate(f *DynamicField, s *goquery.Selection) (time.Time, error) {
 	}
 
 	// collect all the date parts
-	dateParts := []DatePart{}
+	dateParts := []datePart{}
 	combinedParts := CoveredDateParts{}
 	for _, c := range f.Components {
 		if !hasAllDateParts(combinedParts) {
@@ -308,7 +324,7 @@ func getDate(f *DynamicField, s *goquery.Selection) (time.Time, error) {
 				return t, err
 			}
 			if sp != "" {
-				dateParts = append(dateParts, DatePart{
+				dateParts = append(dateParts, datePart{
 					stringPart: strings.Replace(sp, "p.m.", "pm", 1),
 					layoutPart: strings.Replace(c.Layout, "p.m.", "pm", 1),
 				})
@@ -319,13 +335,13 @@ func getDate(f *DynamicField, s *goquery.Selection) (time.Time, error) {
 	// adding default values where necessary
 	if !combinedParts.Year {
 		currentYear := time.Now().Year()
-		dateParts = append(dateParts, DatePart{
+		dateParts = append(dateParts, datePart{
 			stringPart: strconv.Itoa(currentYear),
 			layoutPart: "2006",
 		})
 	}
 	if !combinedParts.Time {
-		dateParts = append(dateParts, DatePart{
+		dateParts = append(dateParts, datePart{
 			stringPart: "20:00",
 			layoutPart: "15:04",
 		})
@@ -377,16 +393,16 @@ func hasAllDateParts(cdp CoveredDateParts) bool {
 	return cdp.Day && cdp.Month && cdp.Year && cdp.Time
 }
 
-func getUrlString(f *DynamicField, s *goquery.Selection, scraperUrl string, res *http.Response) string {
+func getURLString(f *DynamicField, s *goquery.Selection, scraperURL string, res *http.Response) string {
 	var url string
 	attr := "href"
 	if f.ElementLocation.Attr != "" {
 		attr = f.ElementLocation.Attr
 	}
 	if f.ElementLocation.Selector == "" {
-		url = s.AttrOr(attr, scraperUrl)
+		url = s.AttrOr(attr, scraperURL)
 	} else {
-		url = s.Find(f.ElementLocation.Selector).AttrOr(attr, scraperUrl)
+		url = s.Find(f.ElementLocation.Selector).AttrOr(attr, scraperURL)
 	}
 
 	if f.Relative {
@@ -430,7 +446,7 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
 			}
 			fieldNode = fieldNode.NextSibling
-			currentChildIndex += 1
+			currentChildIndex++
 		}
 	} else {
 		fieldString = fieldSelection.AttrOr(t.Attr, "")
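
Taken together, the renames above compose as follows. This is an illustrative sketch only, not part of the patch; it assumes the newConfig helper from main.go above and lives in the same package, and runAll is a hypothetical name:

package main

import (
	"sync"

	"github.com/jakopako/goskyr/output"
)

// runAll is a hypothetical helper, not part of this patch: it wires the
// renamed pieces together the same way main.go does after this change.
func runAll(configPath string) error {
	config, err := newConfig(configPath) // renamed from NewConfig in this patch
	if err != nil {
		return err
	}
	var wg sync.WaitGroup
	for _, s := range config.Scrapers {
		wg.Add(1)                          // one goroutine per configured scraper
		go output.PrettyPrintItems(&wg, s) // internally calls s.GetItems (formerly GetEvents)
	}
	wg.Wait()
	return nil
}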
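The datePart rename also touches the date-assembly logic in getDate: each scraped fragment carries the Go reference-time layout that describes it, and getDate appends defaults for missing parts (the current year, 20:00). The standalone sketch below shows one plausible way such parts combine into a single time.Parse call; the actual combining code is outside the visible hunks, and the real getDate additionally handles localized dates via the goodsign/monday dependency:

package main

import (
	"fmt"
	"time"
)

// datePart mirrors the (now unexported) struct from scraper.go.
type datePart struct {
	stringPart string
	layoutPart string
}

func main() {
	// Fragments as getDate collects them; the year and time entries are the
	// defaults getDate appends when the page does not provide them.
	parts := []datePart{
		{stringPart: "06.03", layoutPart: "02.01"},
		{stringPart: "2022", layoutPart: "2006"},
		{stringPart: "20:00", layoutPart: "15:04"},
	}
	// Concatenate the values and their layouts, then parse in one go.
	var value, layout string
	for _, p := range parts {
		value += p.stringPart + " "
		layout += p.layoutPart + " "
	}
	t, err := time.Parse(layout, value)
	fmt.Println(t, err) // 2022-03-06 20:00:00 +0000 UTC <nil>
}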