Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
Jakob Dhondt committed Jun 22, 2022
1 parent 712993b commit 8304408
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 33 deletions.
46 changes: 46 additions & 0 deletions concerts-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -483,3 +483,49 @@ scrapers:
layout: ["15Uhr04"]
date_location: "Europe/Berlin"
date_language: "de_DE"

- name: Komplex457
url: "https://komplex-457.ch/event/"
item: ".portfolio"
fields:
static:
- name: "location"
value: "Komplex457"
- name: "city"
value: "Zurich"
- name: "type"
value: "concert"
- name: "sourceUrl"
value: "https://komplex-457.ch/event/"
dynamic:
- name: "title"
location:
selector: ".av-masonry-entry-title"
- name: "url"
type: "url"
location:
selector: ""
- name: "date"
type: "date"
on_subpage: "url"
components:
- covers:
day: true
month: true
year: true
location:
selector: ".iconbox_content_container p"
node_index: 0
entire_subtree: true
regex_extract:
exp: "[0-9]{2}\\.[0-9]{2}\\.[0-9]{4}"
layout: ["02.01.2006"]
- covers:
time: true
location:
selector: ".iconbox_content_container p"
node_index: 3
regex_extract:
exp: "[0-9]{1,2}:[0-9]{2}"
layout: ["15:04"]
date_location: "Europe/Berlin"
87 changes: 54 additions & 33 deletions scraper/scraper.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package scraper

import (
"bytes"
"errors"
"fmt"
"log"
Expand Down Expand Up @@ -63,12 +64,13 @@ type RegexConfig struct {

// ElementLocation is used to find a specific string in a html document
type ElementLocation struct {
Selector string `yaml:"selector"`
NodeIndex int `yaml:"node_index"`
ChildIndex int `yaml:"child_index"`
RegexExtract RegexConfig `yaml:"regex_extract"`
Attr string `yaml:"attr"`
MaxLength int `yaml:"max_length"`
Selector string `yaml:"selector"`
NodeIndex int `yaml:"node_index"`
ChildIndex int `yaml:"child_index"`
RegexExtract RegexConfig `yaml:"regex_extract"`
Attr string `yaml:"attr"`
MaxLength int `yaml:"max_length"`
EntireSubtree bool `yaml:"entire_subtree"`
}

// CoveredDateParts is used to determine what parts of a date a
Expand Down Expand Up @@ -486,46 +488,65 @@ func getTextString(t *ElementLocation, s *goquery.Selection) (string, error) {
fieldSelection := s.Find(t.Selector)
if len(fieldSelection.Nodes) > t.NodeIndex {
if t.Attr == "" {
fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild
currentChildIndex := 0
for fieldNode != nil {
// for the case where we want to find the correct string
// by regex (checking all the children and taking the first one that matches the regex)
// the ChildIndex has to be set to -1 to
// distinguish from the default case 0. So when we explicitly set ChildIndex to -1 it means
// check _all_ of the children.
if currentChildIndex == t.ChildIndex || t.ChildIndex == -1 {
if fieldNode.Type == html.TextNode {
fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data)
if err == nil {
if t.MaxLength > 0 && t.MaxLength < len(fieldString) {
fieldString = fieldString[:t.MaxLength] + "..."
if t.EntireSubtree {
// copied from https://github.com/PuerkitoBio/goquery/blob/v1.8.0/property.go#L62
var buf bytes.Buffer
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.TextNode {
// Keep newlines and spaces, like jQuery
buf.WriteString(n.Data)
}
if n.FirstChild != nil {
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
}
f(fieldSelection.Get(t.NodeIndex))
fieldString = buf.String()
} else {
fieldNode := fieldSelection.Get(t.NodeIndex).FirstChild
currentChildIndex := 0
for fieldNode != nil {
// for the case where we want to find the correct string
// by regex (checking all the children and taking the first one that matches the regex)
// the ChildIndex has to be set to -1 to
// distinguish from the default case 0. So when we explicitly set ChildIndex to -1 it means
// check _all_ of the children.
if currentChildIndex == t.ChildIndex || t.ChildIndex == -1 {
if fieldNode.Type == html.TextNode {
fieldString, err = extractStringRegex(&t.RegexExtract, fieldNode.Data)
if err == nil {
break
} else if t.ChildIndex != -1 {
// only in case we do not (ab)use the regex to search across all children
// we want to return the err. Also, we still return the fieldString as
// this might be useful for narrowing down the reason for the error.
return fieldString, err
}
break
} else if t.ChildIndex != -1 {
// only in case we do not (ab)use the regex to search across all children
// we want to return the err. Also, we still return the fieldString as
// this might be useful for narrowing down the reason for the error.
return fieldString, err
}
}
fieldNode = fieldNode.NextSibling
currentChildIndex++
}
fieldNode = fieldNode.NextSibling
currentChildIndex++
}
} else {
// FIXME: AttrOr only reads the attribute of the first node in the selection,
// so t.NodeIndex is ignored here. When several nodes match the selector this
// may return the attribute of the wrong node.
fieldString = fieldSelection.AttrOr(t.Attr, "")
fieldString, err = extractStringRegex(&t.RegexExtract, fieldString)
if err != nil {
return fieldString, err
}
}
}
// automitcally trimming whitespaces might be confusing in some cases...
// automatically trimming whitespaces might be confusing in some cases...
fieldString = strings.TrimSpace(fieldString)
fieldString, err = extractStringRegex(&t.RegexExtract, fieldString)
if err != nil {
return fieldString, err
}
if t.MaxLength > 0 && t.MaxLength < len(fieldString) {
fieldString = fieldString[:t.MaxLength] + "..."
}
return fieldString, nil
}

Expand Down

0 comments on commit 8304408

Please sign in to comment.